diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index e8940338d0..973e2af34b 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -17,13 +17,13 @@
 ################################################################################
 
 PKG_NAME="kodi"
-PKG_VERSION="16.1-rc2-a7caa16"
+PKG_VERSION="17.0-alpha1-2c72ac9"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="$DISTRO_SRC/$PKG_NAME-$PKG_VERSION.tar.xz"
-PKG_DEPENDS_TARGET="toolchain kodi:host xmlstarlet:host libsquish boost Python zlib bzip2 systemd pciutils lzo pcre swig:host libass curl rtmpdump fontconfig fribidi tinyxml libjpeg-turbo libpng tiff freetype jasper libogg libcdio libmpeg2 taglib libxml2 libxslt yajl sqlite libvorbis ffmpeg crossguid giflib"
+PKG_DEPENDS_TARGET="toolchain kodi:host xmlstarlet:host libsquish Python zlib bzip2 systemd pciutils lzo pcre swig:host libass curl rtmpdump fontconfig fribidi tinyxml libjpeg-turbo libpng freetype libogg libcdio taglib libxml2 libxslt yajl sqlite libvorbis ffmpeg crossguid giflib"
 PKG_DEPENDS_HOST="lzo:host libpng:host libjpeg-turbo:host giflib:host"
 PKG_PRIORITY="optional"
 PKG_SECTION="mediacenter"
@@ -49,7 +49,7 @@ fi
 
 if [ ! "$OPENGL" = "no" ]; then
 # for OpenGL (GLX) support
-  PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET $OPENGL glu glew"
+  PKG_DEPENDS_TARGET="$PKG_DEPENDS_TARGET $OPENGL glu"
   KODI_OPENGL="--enable-gl"
 else
   KODI_OPENGL="--disable-gl"
@@ -228,14 +228,14 @@ export CXXFLAGS_FOR_BUILD="$HOST_CXXFLAGS"
 export CFLAGS_FOR_BUILD="$HOST_CFLAGS"
 export LDFLAGS_FOR_BUILD="$HOST_LDFLAGS"
 
-export PYTHON_VERSION="2.7"
+export PYTHON_VERSION=2.7
 export PYTHON_CPPFLAGS="-I$SYSROOT_PREFIX/usr/include/python$PYTHON_VERSION"
 export PYTHON_LDFLAGS="-L$SYSROOT_PREFIX/usr/lib/python$PYTHON_VERSION -lpython$PYTHON_VERSION"
 export PYTHON_SITE_PKG="$SYSROOT_PREFIX/usr/lib/python$PYTHON_VERSION/site-packages"
-export ac_python_version="$PYTHON_VERSION"
 
 PKG_CONFIGURE_OPTS_TARGET="gl_cv_func_gettimeofday_clobber=no \
-                           ac_cv_lib_bluetooth_hci_devid=no \
+                           ac_python_version=$PYTHON_VERSION \
+                           --disable-libbluetooth \
                            --disable-debug \
                            --disable-optimizations \
                            $KODI_OPENGL \
@@ -279,31 +279,44 @@ PKG_CONFIGURE_OPTS_TARGET="gl_cv_func_gettimeofday_clobber=no \
 
 pre_configure_host() {
 # kodi fails to build in subdirs
-  cd $ROOT/$PKG_BUILD
-    rm -rf .$HOST_NAME
+  rm -rf $ROOT/$PKG_BUILD/.$HOST_NAME
+}
+
+configure_host() {
+  : # not needed
 }
 
 make_host() {
-  make -C tools/depends/native/JsonSchemaBuilder
-  make -C tools/depends/native/TexturePacker
+  mkdir -p $ROOT/$PKG_BUILD/tools/depends/native/JsonSchemaBuilder/bin && cd $_
+  cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CONF \
+        -DCMAKE_INSTALL_PREFIX=/usr \
+        ..
+  make
+  mkdir -p $ROOT/$PKG_BUILD/tools/depends/native/TexturePacker/bin && cd $_
+  cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CONF \
+        -DCMAKE_INSTALL_PREFIX=/usr \
+        -DCORE_SOURCE_DIR=$ROOT/$PKG_BUILD \
+        -DCMAKE_CXX_FLAGS="-std=c++11 -DTARGET_POSIX -DTARGET_LINUX -D_LINUX -I$ROOT/$PKG_BUILD/xbmc/linux" \
+        ..
+  make
 }
 
 makeinstall_host() {
-  cp -PR tools/depends/native/JsonSchemaBuilder/native/JsonSchemaBuilder $ROOT/$TOOLCHAIN/bin
-  rm -f $ROOT/$TOOLCHAIN/bin/TexturePacker
-  cp -PR tools/depends/native/TexturePacker/native/TexturePacker $ROOT/$TOOLCHAIN/bin
+# JsonSchemaBuilder is needed by the target build (see JSON_BUILDER in pre_configure_target)
+  cp -P $ROOT/$PKG_BUILD/tools/depends/native/JsonSchemaBuilder/bin/JsonSchemaBuilder $ROOT/$TOOLCHAIN/bin
+  cp -P $ROOT/$PKG_BUILD/tools/depends/native/TexturePacker/bin/TexturePacker $ROOT/$TOOLCHAIN/bin
 }
 
 pre_build_target() {
 # adding fake Makefile for stripped skin
-  mkdir -p $PKG_BUILD/addons/skin.estuary/media
-  touch $PKG_BUILD/addons/skin.estuary/media/Makefile.in
+  mkdir -p $ROOT/$PKG_BUILD/addons/skin.estuary/media
+  touch $ROOT/$PKG_BUILD/addons/skin.estuary/media/Makefile.in
 }
 
 pre_configure_target() {
 # kodi fails to build in subdirs
   cd $ROOT/$PKG_BUILD
-    rm -rf .$TARGET_NAME
+  rm -rf $ROOT/$PKG_BUILD/.$TARGET_NAME
 
 # kodi should never be built with lto
   strip_lto
@@ -313,6 +326,9 @@ pre_configure_target() {
   export LIBS="$LIBS -lz"
 
   export JSON_BUILDER=$ROOT/$TOOLCHAIN/bin/JsonSchemaBuilder
+
+# autoreconf
+  BOOTSTRAP_STANDALONE=1 make -f $ROOT/$PKG_BUILD/bootstrap.mk
 }
 
 make_target() {
@@ -337,9 +353,12 @@ post_makeinstall_target() {
   rm -rf $INSTALL/usr/bin/xbmc
   rm -rf $INSTALL/usr/bin/xbmc-standalone
   rm -rf $INSTALL/usr/lib/kodi/*.cmake
-
-  # more binaddons cross compile badness meh
-  sed -i -e "s:INCLUDE_DIR /usr/include/kodi:INCLUDE_DIR $SYSROOT_PREFIX/usr/include/kodi:g" $SYSROOT_PREFIX/usr/lib/kodi/kodi-config.cmake
+  rm -rf $INSTALL/usr/share/applications
+  rm -rf $INSTALL/usr/share/icons
+  rm -rf $INSTALL/usr/share/kodi/addons/skin.estouchy
+  rm -rf $INSTALL/usr/share/kodi/addons/service.xbmc.versioncheck
+  rm -rf $INSTALL/usr/share/kodi/addons/visualization.vortex
+  rm -rf $INSTALL/usr/share/xsessions
 
   mkdir -p $INSTALL/usr/lib/kodi
     cp $PKG_DIR/scripts/kodi-config $INSTALL/usr/lib/kodi
@@ -358,12 +377,6 @@ post_makeinstall_target() {
     rm -rf $INSTALL/usr/lib/kodi/kodi-xrandr
   fi
 
-  rm -rf $INSTALL/usr/share/applications
-  rm -rf $INSTALL/usr/share/icons
-  rm -rf $INSTALL/usr/share/kodi/addons/service.xbmc.versioncheck
-  rm -rf $INSTALL/usr/share/kodi/addons/visualization.vortex
-  rm -rf $INSTALL/usr/share/xsessions
-
   mkdir -p $INSTALL/usr/share/kodi/addons
     cp -R $PKG_DIR/config/os.openelec.tv $INSTALL/usr/share/kodi/addons
     $SED "s|@OS_VERSION@|$OS_VERSION|g" -i $INSTALL/usr/share/kodi/addons/os.openelec.tv/addon.xml
@@ -372,8 +385,8 @@ post_makeinstall_target() {
     cp -R $PKG_DIR/config/repository.libreelec.tv $INSTALL/usr/share/kodi/addons
     $SED "s|@ADDON_URL@|$ADDON_URL|g" -i $INSTALL/usr/share/kodi/addons/repository.libreelec.tv/addon.xml
 
-  mkdir -p $INSTALL/usr/lib/python"$PYTHON_VERSION"/site-packages/kodi
-    cp -R tools/EventClients/lib/python/* $INSTALL/usr/lib/python"$PYTHON_VERSION"/site-packages/kodi
+  mkdir -p $INSTALL/usr/lib/python$PYTHON_VERSION/site-packages/kodi
+    cp -R tools/EventClients/lib/python/* $INSTALL/usr/lib/python$PYTHON_VERSION/site-packages/kodi
 
   mkdir -p $INSTALL/usr/share/kodi/config
     cp $PKG_DIR/config/guisettings.xml $INSTALL/usr/share/kodi/config
@@ -402,6 +415,18 @@ post_makeinstall_target() {
       cp $PKG_DIR/config/appliance.xml $INSTALL/usr/share/kodi/system/settings
     fi
 
+  # update addon manifest
+  ADDON_MANIFEST=$INSTALL/usr/share/kodi/system/addon-manifest.xml
+  xmlstarlet ed -L -d "/addons/addon[text()='service.xbmc.versioncheck']" $ADDON_MANIFEST
+  xmlstarlet ed -L -d "/addons/addon[text()='skin.estouchy']" $ADDON_MANIFEST
+  xmlstarlet ed -L --subnode "/addons" -t elem -n "addon" -v "peripheral.joystick" $ADDON_MANIFEST
+  xmlstarlet ed -L --subnode "/addons" -t elem -n "addon" -v "os.libreelec.tv" $ADDON_MANIFEST
+  xmlstarlet ed -L --subnode "/addons" -t elem -n "addon" -v "repository.libreelec.tv" $ADDON_MANIFEST
+  xmlstarlet ed -L --subnode "/addons" -t elem -n "addon" -v "service.libreelec.settings" $ADDON_MANIFEST
+
+  # more binaddons cross compile badness meh
+  sed -i -e "s:INCLUDE_DIR /usr/include/kodi:INCLUDE_DIR $SYSROOT_PREFIX/usr/include/kodi:g" $SYSROOT_PREFIX/usr/lib/kodi/kodi-config.cmake
+
   if [ "$KODI_EXTRA_FONTS" = yes ]; then
     mkdir -p $INSTALL/usr/share/kodi/media/Fonts
       cp $PKG_DIR/fonts/*.ttf $INSTALL/usr/share/kodi/media/Fonts
diff --git a/packages/mediacenter/kodi/patches/kodi-100.10-handle-SIGTERM.patch b/packages/mediacenter/kodi/patches/kodi-100.10-handle-SIGTERM.patch
index 8de57f0ce0..a297f34eed 100644
--- a/packages/mediacenter/kodi/patches/kodi-100.10-handle-SIGTERM.patch
+++ b/packages/mediacenter/kodi/patches/kodi-100.10-handle-SIGTERM.patch
@@ -121,14 +121,14 @@ index c46cba1..ed3f35f 100644
    bool m_AppFocused;
    bool m_renderGUI;
  
-diff --git a/xbmc/main/main.cpp b/xbmc/main/main.cpp
+diff --git a/xbmc/platform/posix/main.cpp b/xbmc/platform/posix/main.cpp
 index 01027f8..4cfb04e 100644
---- a/xbmc/main/main.cpp
-+++ b/xbmc/main/main.cpp
+--- a/xbmc/platform/posix/main.cpp
++++ b/xbmc/platform/posix/main.cpp
 @@ -41,12 +41,27 @@
  #include "input/linux/LIRC.h"
  #endif
- #include "XbmcContext.h"
+ #include "platform/XbmcContext.h"
 +#include "Application.h"
 +
 +void xbmc_term_handler(int signum)
diff --git a/packages/mediacenter/kodi/patches/kodi-100.12-prevent-kodi-switching-to-windowed-mode.patch b/packages/mediacenter/kodi/patches/kodi-100.12-prevent-kodi-switching-to-windowed-mode.patch
index 0bbb65f8bc..d11e02ded8 100644
--- a/packages/mediacenter/kodi/patches/kodi-100.12-prevent-kodi-switching-to-windowed-mode.patch
+++ b/packages/mediacenter/kodi/patches/kodi-100.12-prevent-kodi-switching-to-windowed-mode.patch
@@ -12,13 +12,13 @@ index 5ac2482..cb84940 100644
 --- a/xbmc/windowing/X11/WinSystemX11.h
 +++ b/xbmc/windowing/X11/WinSystemX11.h
 @@ -54,6 +54,7 @@ public:
-   virtual bool ResizeWindow(int newWidth, int newHeight, int newLeft, int newTop);
-   virtual bool SetFullScreen(bool fullScreen, RESOLUTION_INFO& res, bool blankOtherDisplays);
-   virtual void UpdateResolutions();
-+  virtual bool CanDoWindowed() { return false; }
-   virtual int  GetNumScreens() { return 1; }
-   virtual int  GetCurrentScreen() { return m_nScreen; }
-   virtual void ShowOSMouse(bool show);
+   bool ResizeWindow(int newWidth, int newHeight, int newLeft, int newTop) override;
+   bool SetFullScreen(bool fullScreen, RESOLUTION_INFO& res, bool blankOtherDisplays) override;
+   void UpdateResolutions() override;
++  bool CanDoWindowed() override { return false; }
+   int  GetNumScreens() override { return 1; }
+   int  GetCurrentScreen() override { return m_nScreen; }
+   void ShowOSMouse(bool show) override;
 -- 
 2.5.0
 
diff --git a/packages/mediacenter/kodi/patches/kodi-100.15-rename-default-pulse-device.patch b/packages/mediacenter/kodi/patches/kodi-100.15-rename-default-pulse-device.patch
index 6b4d5e3c54..7e96faaf3f 100644
--- a/packages/mediacenter/kodi/patches/kodi-100.15-rename-default-pulse-device.patch
+++ b/packages/mediacenter/kodi/patches/kodi-100.15-rename-default-pulse-device.patch
@@ -1,17 +1,18 @@
-From b70f963a0963735e627b12cf361e4e30d6c2a799 Mon Sep 17 00:00:00 2001
-From: Lukas Rusak <lorusak@gmail.com>
-Date: Tue, 26 Jan 2016 08:31:52 +0100
-Subject: [PATCH] change pulseaudio default device name to Bluetooth Audio
+From cd60daafb0c6b1e1de94dbc944bb247a8f810b50 Mon Sep 17 00:00:00 2001
+From: fritsch <peter.fruehberger@gmail.com>
+Date: Fri, 29 Jan 2016 16:32:06 +0100
+Subject: [PATCH] change pulseaudio default device name to Bluetooth
+ Audio
 
 ---
  xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
-index 67b7b90..8537163 100644
+index 7c669b7..043d0d5 100644
 --- a/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
 +++ b/xbmc/cores/AudioEngine/Sinks/AESinkPULSE.cpp
-@@ -362,12 +362,14 @@ static void SinkInfoRequestCallback(pa_context *c, const pa_sink_info *i, int eo
+@@ -394,13 +394,15 @@ static void SinkInfoRequestCallback(pa_context *c, const pa_sink_info *i, int eo
      CAEDeviceInfo defaultDevice;
      defaultDevice.m_deviceName = std::string("Default");
      defaultDevice.m_displayName = std::string("Default");
@@ -21,6 +22,7 @@ index 67b7b90..8537163 100644
      defaultDevice.m_channels = CAEChannelInfo(AE_CH_LAYOUT_2_0);
      defaultDevice.m_sampleRates.assign(defaultSampleRates, defaultSampleRates + ARRAY_SIZE(defaultSampleRates));
      defaultDevice.m_deviceType = AE_DEVTYPE_PCM;
+     defaultDevice.m_wantsIECPassthrough = true;
      sinkStruct->list->push_back(defaultDevice);
 +    // OE only wants the default device - so we are done here
 +    return;
@@ -28,5 +30,5 @@ index 67b7b90..8537163 100644
    if (i && i->name)
    {
 -- 
-1.9.3
+2.5.0
 
diff --git a/packages/mediacenter/kodi/patches/kodi-100.30-fix-libdvd.patch b/packages/mediacenter/kodi/patches/kodi-100.30-fix-libdvd.patch
new file mode 100644
index 0000000000..0098ba0ec0
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-100.30-fix-libdvd.patch
@@ -0,0 +1,20 @@
+diff -Naur a/Makefile.in b/Makefile.in
+--- a/Makefile.in	2016-02-25 15:54:54.305799646 +0000
++++ b/Makefile.in	2016-02-25 15:55:30.609990228 +0000
+@@ -11,7 +11,6 @@
+ 
+ DVDPCODECS_DIRS= \
+-	lib \
+-	lib/libdvd
++	lib
+ 
+ VideoPlayer_ARCHIVES=xbmc/cores/VideoPlayer/VideoPlayer.a \
+                    xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecs.a \
+@@ -332,7 +331,6 @@
+ 	$(MAKE) -C lib/addons/library.kodi.peripheral
+ 	$(MAKE) -C lib/addons/library.xbmc.pvr
+ dvdpcodecs: dllloader
+-	$(MAKE) -C lib/libdvd
+ 
+ dvdpextcodecs:
+ 
diff --git a/packages/mediacenter/kodi/patches/kodi-999.10-aarch64-support.patch b/packages/mediacenter/kodi/patches/kodi-999.10-aarch64-support.patch
index f6353de3bb..9e6a061167 100644
--- a/packages/mediacenter/kodi/patches/kodi-999.10-aarch64-support.patch
+++ b/packages/mediacenter/kodi/patches/kodi-999.10-aarch64-support.patch
@@ -1,40 +1,8 @@
-From ae4e63aa165583ef5eaab4759e7af24eeffbb197 Mon Sep 17 00:00:00 2001
-From: croniccorey <cronmod.dev@gmail.com>
-Date: Sun, 13 Dec 2015 16:37:05 -0500
-Subject: [PATCH] KODI: Add support for aarch64 platform
-
----
- addons/library.xbmc.addon/libXBMC_addon.h   |  2 ++
- configure.ac                                | 12 ++++++++++++
- m4/xbmc_arch.m4                             |  2 +-
- xbmc/cores/DllLoader/DllLoader.h            |  2 +-
- xbmc/cores/DllLoader/ldt_keeper.c           |  2 +-
- xbmc/cores/VideoRenderers/LinuxRendererGL.h |  2 +-
- xbmc/linux/PlatformDefs.h                   |  2 +-
- xbmc/threads/Atomics.cpp                    |  2 +-
- xbmc/utils/CPUInfo.cpp                      |  2 +-
- xbmc/utils/MathUtils.h                      |  3 ++-
- 10 files changed, 23 insertions(+), 8 deletions(-)
-
-diff --git a/addons/library.xbmc.addon/libXBMC_addon.h b/addons/library.xbmc.addon/libXBMC_addon.h
-index c3ed54f..76190b6 100644
---- a/addons/library.xbmc.addon/libXBMC_addon.h
-+++ b/addons/library.xbmc.addon/libXBMC_addon.h
-@@ -55,6 +55,8 @@ typedef intptr_t      ssize_t;
- #define ADDON_HELPER_ARCH       "powerpc64-linux"
- #elif defined(__ARMEL__)
- #define ADDON_HELPER_ARCH       "arm"
-+#elif defined(__aarch64__)
-+#define ADDON_HELPER_ARCH       "aarch64"
- #elif defined(__mips__)
- #define ADDON_HELPER_ARCH       "mips"
- #else
-diff --git a/configure.ac b/configure.ac
-index c767357..80f3807 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -691,6 +691,18 @@ case $host in
-      use_wayland=no
+diff -Naur a/configure.ac b/configure.ac
+--- a/configure.ac	2016-02-14 00:19:36.000000000 +0100
++++ b/configure.ac	2016-03-02 09:15:52.504860258 +0100
+@@ -641,6 +641,18 @@
+      use_gl=no
       USE_STATIC_FFMPEG=1
       ;;
 +  aarch64*-*-linux-gnu*)
@@ -52,11 +20,10 @@ index c767357..80f3807 100644
    arm*-*linux-android*)
       target_platform=target_android
       use_arch="arm"
-diff --git a/m4/xbmc_arch.m4 b/m4/xbmc_arch.m4
-index 0b66a82..fa08537 100644
---- a/m4/xbmc_arch.m4
-+++ b/m4/xbmc_arch.m4
-@@ -60,7 +60,7 @@ case $host in
+diff -Naur a/m4/xbmc_arch.m4 b/m4/xbmc_arch.m4
+--- a/m4/xbmc_arch.m4	2016-02-14 00:19:42.000000000 +0100
++++ b/m4/xbmc_arch.m4	2016-03-02 09:16:33.341943374 +0100
+@@ -60,7 +60,7 @@
    powerpc64-*-linux-gnu*|powerpc64-*-linux-uclibc*)
       AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -D_POWERPC64")
       ;;
@@ -65,95 +32,3 @@ index 0b66a82..fa08537 100644
       AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX")
       ;;
    mips*-*-linux-gnu*|mips*-*-linux-uclibc*)
-diff --git a/xbmc/cores/DllLoader/DllLoader.h b/xbmc/cores/DllLoader/DllLoader.h
-index 070aee6..e669203 100644
---- a/xbmc/cores/DllLoader/DllLoader.h
-+++ b/xbmc/cores/DllLoader/DllLoader.h
-@@ -23,7 +23,7 @@
- #include "coffldr.h"
- #include "LibraryLoader.h"
- 
--#if defined(__linux__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__)
-+#if defined(__linux__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__) && !defined(__aarch64__)
- #define USE_LDT_KEEPER
- #include "ldt_keeper.h"
- #endif
-diff --git a/xbmc/cores/DllLoader/ldt_keeper.c b/xbmc/cores/DllLoader/ldt_keeper.c
-index 8af9a86..1c0cdb2 100644
---- a/xbmc/cores/DllLoader/ldt_keeper.c
-+++ b/xbmc/cores/DllLoader/ldt_keeper.c
-@@ -19,7 +19,7 @@
-  */
- 
- //#ifndef __powerpc__
--#if !defined(__powerpc__) && !defined(__ppc__) && !defined(__arm__) && !defined(__mips__)
-+#if !defined(__powerpc__) && !defined(__ppc__) && !defined(__arm__) && !defined(__mips__) && !defined(__aarch64__)
- 
- #include "ldt_keeper.h"
- 
-diff --git a/xbmc/cores/VideoRenderers/LinuxRendererGL.h b/xbmc/cores/VideoRenderers/LinuxRendererGL.h
-index fcdea8d..5a3e3df 100644
---- a/xbmc/cores/VideoRenderers/LinuxRendererGL.h
-+++ b/xbmc/cores/VideoRenderers/LinuxRendererGL.h
-@@ -323,7 +323,7 @@ class CLinuxRendererGL : public CBaseRenderer
- 
- 
- inline int NP2( unsigned x ) {
--#if defined(TARGET_POSIX) && !defined(__POWERPC__) && !defined(__PPC__) && !defined(__arm__) && !defined(__mips__)
-+#if defined(TARGET_POSIX) && !defined(__POWERPC__) && !defined(__PPC__) && !defined(__arm__) && !defined(__mips__) && !defined(__aarch64__)
-   // If there are any issues compiling this, just append a ' && 0'
-   // to the above to make it '#if defined(TARGET_POSIX) && 0'
- 
-diff --git a/xbmc/linux/PlatformDefs.h b/xbmc/linux/PlatformDefs.h
-index 4350075..2c6fff4 100644
---- a/xbmc/linux/PlatformDefs.h
-+++ b/xbmc/linux/PlatformDefs.h
-@@ -161,7 +161,7 @@
- #define __int64   long long
- #define __uint64  unsigned long long
- 
--#if defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined (__arm__) || defined(__mips__) // should this be powerpc64 only?
-+#if defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined (__arm__) || defined(__mips__) || defined(__aarch64__)
- #define __stdcall
- #else /* !__x86_64__ */
- #define __stdcall   __attribute__((__stdcall__))
-diff --git a/xbmc/threads/Atomics.cpp b/xbmc/threads/Atomics.cpp
-index 417b2b6..c172867 100644
---- a/xbmc/threads/Atomics.cpp
-+++ b/xbmc/threads/Atomics.cpp
-@@ -106,7 +106,7 @@ long cas(volatile long *pAddr, long expectedVal, long swapVal)
- ///////////////////////////////////////////////////////////////////////////
- long long cas2(volatile long long* pAddr, long long expectedVal, long long swapVal)
- {
--#if defined(__ppc__) || defined(__powerpc__) || defined(__arm__)// PowerPC and ARM
-+#if defined(__ppc__) || defined(__powerpc__) || defined(__arm__) || defined(__aarch64__) // PowerPC and ARM
- // Not available/required
- // Hack to allow compilation
-   throw "cas2 is not implemented";
-diff --git a/xbmc/utils/CPUInfo.cpp b/xbmc/utils/CPUInfo.cpp
-index 9e709e7..d7fef37 100644
---- a/xbmc/utils/CPUInfo.cpp
-+++ b/xbmc/utils/CPUInfo.cpp
-@@ -914,7 +914,7 @@ void CCPUInfo::ReadCPUFeatures()
-   #endif
- #elif defined(LINUX)
- // empty on purpose, the implementation is in the constructor
--#elif !defined(__powerpc__) && !defined(__ppc__) && !defined(__arm__)
-+#elif !defined(__powerpc__) && !defined(__ppc__) && !defined(__arm__) && !defined(__aarch64__)
-   m_cpuFeatures |= CPU_FEATURE_MMX;
- #elif defined(__powerpc__) || defined(__ppc__)
-   m_cpuFeatures |= CPU_FEATURE_ALTIVEC;
-diff --git a/xbmc/utils/MathUtils.h b/xbmc/utils/MathUtils.h
-index 08140b7..249b4dd 100644
---- a/xbmc/utils/MathUtils.h
-+++ b/xbmc/utils/MathUtils.h
-@@ -35,7 +35,8 @@
- #if defined(__ppc__) || \
-     defined(__powerpc__) || \
-     defined(__mips__) || \
--    defined(__arm__)
-+    defined(__arm__) || \
-+    defined(__aarch64__)
-   #define DISABLE_MATHUTILS_ASM_ROUND_INT
- #endif
- 
diff --git a/packages/mediacenter/kodi/patches/kodi-999.11-fix-aarch64-compile.patch b/packages/mediacenter/kodi/patches/kodi-999.11-fix-aarch64-compile.patch
index 6ad637c5f7..56de975a26 100644
--- a/packages/mediacenter/kodi/patches/kodi-999.11-fix-aarch64-compile.patch
+++ b/packages/mediacenter/kodi/patches/kodi-999.11-fix-aarch64-compile.patch
@@ -1,7 +1,7 @@
-diff -Naur a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp	2016-02-02 23:09:39.000000000 +0100
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp	2016-02-10 07:01:28.941399058 +0100
-@@ -1598,7 +1598,7 @@
+diff -Naur a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp	2016-02-14 00:19:32.000000000 +0100
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp	2016-03-02 09:36:04.503330833 +0100
+@@ -1601,7 +1601,7 @@
          am_private->gcodec.param = (void*)(EXTERNAL_PTS | SYNC_OUTSIDE);
        break;
    }
@@ -10,15 +10,3 @@ diff -Naur a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/dvdp
  
    // translate from generic to firemware version dependent
    m_dll->codec_init_para(&am_private->gcodec, &am_private->vcodec);
-diff -Naur a/xbmc/guilib/GUIFontTTFGL.cpp b/xbmc/guilib/GUIFontTTFGL.cpp
---- a/xbmc/guilib/GUIFontTTFGL.cpp	2016-02-02 23:09:39.000000000 +0100
-+++ b/xbmc/guilib/GUIFontTTFGL.cpp	2016-02-10 06:59:49.261293006 +0100
-@@ -232,7 +232,7 @@
-       glUniformMatrix4fv(modelLoc, 1, GL_FALSE, glMatrixModview.Get());
- 
-       // Bind the buffer to the OpenGL context's GL_ARRAY_BUFFER binding point
--      glBindBuffer(GL_ARRAY_BUFFER, (GLuint) m_vertexTrans[i].vertexBuffer->bufferHandle);
-+      glBindBuffer(GL_ARRAY_BUFFER, (unsigned long) m_vertexTrans[i].vertexBuffer->bufferHandle);
- 
-       // Do the actual drawing operation, split into groups of characters no
-       // larger than the pre-determined size of the element array
diff --git a/packages/mediacenter/kodi/patches/kodi-999.22-PR8254.patch b/packages/mediacenter/kodi/patches/kodi-999.22-PR8254.patch
deleted file mode 100644
index ef26dc153d..0000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.22-PR8254.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-From ee014b442eac3e85178c89d04691f4fc73cf89e4 Mon Sep 17 00:00:00 2001
-From: "Chris \"Koying\" Browet" <cbro@semperpax.com>
-Date: Sun, 18 Oct 2015 11:24:00 +0200
-Subject: [PATCH] FIX: Only handle 3D bitmap subs in TAB
-
-3D bitmap subs cannot be detected in SBS, and there is always the option
-to play with Kodi 3D disabled
----
- xbmc/cores/dvdplayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp | 7 +------
- 1 file changed, 1 insertion(+), 6 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
-index dca4bdf..c9a0008 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
-@@ -257,18 +257,13 @@ CDVDOverlay* CDVDOverlayCodecFFmpeg::GetOverlay()
-     }
- 
-     RENDER_STEREO_MODE render_stereo_mode = g_graphicsContext.GetStereoMode();
--    if (render_stereo_mode != RENDER_STEREO_MODE_OFF)
-+    if (render_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-     {
-       if (rect.h > m_height / 2)
-       {
-         m_height /= 2;
-         rect.h /= 2;
-       }
--      else if (rect.w > m_width / 2)
--      {
--        m_width /= 2;
--        rect.w /= 2;
--      }
-     }
- 
-     CDVDOverlayImage* overlay = new CDVDOverlayImage();
diff --git a/packages/mediacenter/kodi/patches/kodi-999.42-KEY_EPG.patch b/packages/mediacenter/kodi/patches/kodi-999.42-KEY_EPG.patch
deleted file mode 100644
index 5fc8ad80f3..0000000000
--- a/packages/mediacenter/kodi/patches/kodi-999.42-KEY_EPG.patch
+++ /dev/null
@@ -1,63 +0,0 @@
-diff -Naur kodi-16.0-beta5-19fc4fa/system/keymaps/keyboard.xml kodi-16.0-beta5-19fc4fa.patch/system/keymaps/keyboard.xml
---- kodi-16.0-beta5-19fc4fa/system/keymaps/keyboard.xml	2015-12-15 00:07:12.000000000 +0100
-+++ kodi-16.0-beta5-19fc4fa.patch/system/keymaps/keyboard.xml	2016-01-05 06:41:50.857902588 +0100
-@@ -113,6 +113,7 @@
-       <sleep>ActivateWindow(shutdownmenu)</sleep>
-       <!-- PVR windows -->
-       <e>ActivateWindow(TVGuide)</e>
-+      <epg>ActivateWindow(TVGuide)</epg>
-       <h>ActivateWindow(TVChannels)</h>
-       <j>ActivateWindow(RadioChannels)</j>
-       <k>ActivateWindow(TVRecordings)</k>
-@@ -268,6 +269,7 @@
-   <TVGuide>
-     <keyboard>
-       <e>PreviousMenu</e>
-+      <epg>PreviousMenu</epg>
-     </keyboard>
-   </TVGuide>
-   <MyFiles>
-diff -Naur kodi-16.0-beta5-19fc4fa/xbmc/input/linux/LinuxInputDevices.cpp kodi-16.0-beta5-19fc4fa.patch/xbmc/input/linux/LinuxInputDevices.cpp
---- kodi-16.0-beta5-19fc4fa/xbmc/input/linux/LinuxInputDevices.cpp	2015-12-15 00:07:12.000000000 +0100
-+++ kodi-16.0-beta5-19fc4fa.patch/xbmc/input/linux/LinuxInputDevices.cpp	2016-01-05 06:41:13.847846527 +0100
-@@ -268,6 +268,7 @@
-   { KEY_FILE          , XBMCK_LAUNCH_FILE_BROWSER},
-   { KEY_SELECT        , XBMCK_RETURN      },
-   { KEY_CONFIG        , XBMCK_CONFIG      },
-+  { KEY_EPG           , XBMCK_EPG         },
-   // The Little Black Box Remote Additions
-   { 384               , XBMCK_LEFT        }, // Red
-   { 378               , XBMCK_RIGHT       }, // Green
-diff -Naur kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_keysym.h kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_keysym.h
---- kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_keysym.h	2015-12-15 00:07:12.000000000 +0100
-+++ kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_keysym.h	2016-01-05 06:41:13.847846527 +0100
-@@ -229,6 +229,7 @@
-   XBMCK_FAVORITES   = 0x14d,
-   XBMCK_HOMEPAGE    = 0x14e,
-   XBMCK_CONFIG      = 0x14f,
-+  XBMCK_EPG         = 0x150,
- 
-   // Add any other keys here
- 
-diff -Naur kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_keytable.cpp kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_keytable.cpp
---- kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_keytable.cpp	2015-12-15 00:07:12.000000000 +0100
-+++ kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_keytable.cpp	2016-01-05 06:41:13.848846528 +0100
-@@ -243,6 +243,7 @@
- , { XBMCK_FAVORITES,              0,    0, XBMCVK_FAVORITES,     "favorites" }
- , { XBMCK_HOMEPAGE ,              0,    0, XBMCVK_HOMEPAGE,      "homepage" }
- , { XBMCK_CONFIG,                 0,    0, XBMCVK_CONFIG,        "config" }
-+, { XBMCK_EPG   ,                 0,    0, XBMCVK_EPG,           "epg" }
- };
- 
- static int XBMCKeyTableSize = sizeof(XBMCKeyTable)/sizeof(XBMCKEYTABLE);
-diff -Naur kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_vkeys.h kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_vkeys.h
---- kodi-16.0-beta5-19fc4fa/xbmc/input/XBMC_vkeys.h	2015-12-15 00:07:12.000000000 +0100
-+++ kodi-16.0-beta5-19fc4fa.patch/xbmc/input/XBMC_vkeys.h	2016-01-05 06:41:13.848846528 +0100
-@@ -221,6 +221,7 @@
-   XBMCVK_FAVORITES      = 0xE9,
-   XBMCVK_HOMEPAGE       = 0xEA,
-   XBMCVK_CONFIG         = 0xEB,
-+  XBMCVK_EPG            = 0xEC,
- 
-   XBMCVK_LAST           = 0xFF
- } XBMCVKey;
diff --git a/packages/mediacenter/kodi/scripts/kodi-config b/packages/mediacenter/kodi/scripts/kodi-config
index 78f9c8f232..d7a26fe100 100755
--- a/packages/mediacenter/kodi/scripts/kodi-config
+++ b/packages/mediacenter/kodi/scripts/kodi-config
@@ -23,7 +23,7 @@
 chmod +x /storage/.kodi/addons/*/bin/*
 
 # Nasty hack to work around OE to LE migration - Addons*.db needs to forget all about OE addons
-ADDONSDB=$(ls -1 /storage/.kodi/userdata/Database/Addons20.db 2>/dev/null)
+ADDONSDB=$(ls -1 /storage/.kodi/userdata/Database/Addons23.db 2>/dev/null)
 if [ -n "${ADDONSDB}" ]; then
   OEREPO="'repository.openelec.tv'"
 
diff --git a/projects/RPi/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch b/projects/RPi/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch
deleted file mode 100644
index 84ea8d1914..0000000000
--- a/projects/RPi/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 01759c5adfb050b1ba0c8a8fc4e20a875a98c0e5 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 10 Aug 2014 18:58:37 +0100
-Subject: [PATCH 1/2] confluence: Remove media-overlay.jpg for when video is
- backgrounded
-
----
- 720p/IncludesBackgroundBuilding.xml | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/720p/IncludesBackgroundBuilding.xml b/720p/IncludesBackgroundBuilding.xml
-index cc996da..d6aa595 100644
---- a/720p/IncludesBackgroundBuilding.xml
-+++ b/720p/IncludesBackgroundBuilding.xml
-@@ -32,7 +32,7 @@
- 		</control>
- 		<control type="image">
- 			<include>BackgroundDimensions</include>
--			<texture>special://skin/backgrounds/media-overlay.jpg</texture>
-+			<!--texture>special://skin/backgrounds/media-overlay.jpg</texture-->
- 			<visible>[Player.HasVideo + !Skin.HasSetting(ShowBackgroundVideo)] + !Window.IsVisible(TVChannels) + !Window.IsVisible(RadioChannels)</visible>
- 			<include>VisibleFadeEffect</include>
- 		</control>
--- 
-2.5.0
-
-
-From d99f70c094006144f07bdf739f5847b733030245 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jul 2013 23:19:15 +0100
-Subject: [PATCH 2/2] [confluence] Use animated gif as a cheaper working
- spinner
-
----
- 720p/DialogBusy.xml |   4 ++--
- media/busy.gif      | Bin 0 -> 3199 bytes
- 2 files changed, 2 insertions(+), 2 deletions(-)
- create mode 100644 media/busy.gif
-
-diff --git a/720p/DialogBusy.xml b/720p/DialogBusy.xml
-index b5df91f..8e84fed 100644
---- a/720p/DialogBusy.xml
-+++ b/720p/DialogBusy.xml
-@@ -25,9 +25,9 @@
- 				<top>20</top>
- 				<width>32</width>
- 				<height>32</height>
--				<texture>busy.png</texture>
-+				<texture>busy.gif</texture>
- 				<aspectratio>keep</aspectratio>
--				<animation effect="rotate" start="0" end="360" center="36,36" time="900" loop="true" condition="true">conditional</animation>
-+				<!--animation effect="rotate" start="0" end="360" center="36,36" time="900" loop="true" condition="true">conditional</animation-->
- 			</control>
- 			<control type="label">
- 				<description>Busy label</description>
-diff --git a/media/busy.gif b/media/busy.gif
-new file mode 100644
-index 0000000000000000000000000000000000000000..f856ed0b44fdc7e9b0520c7e39a9bebc04136897
-GIT binary patch
-literal 3199
-zcmc(fX;f2Z9>y=ZH#f_@I(Toefj|vNK&XX)Kv`7?5Fj9nfFK|eqk^a)i$D+vOGrT0
-z5Y~W_Kw^kiL|j0tWs!ndI9M0djt<b)1^0UFSf^(ku;<M510Uv`Ij8e}c;7GYd*1VV
-z{?GGp{e7KX5|{uJcmpi14<9}_J39-7LO(x0fj}@eHm22TEi5dIjEuazyk=%*-oJnU
-z@#5lU`YS}SUp)V5#7Wzkkg-$1%u32gWG3uM$ljZhAh1c=zbgeW{eW!p3E#}|4DqqI
-zVS{=L57sWeUi=h*9_f$yCnc!oCm!*6V)Hg%)Sx4b;3*G3y_Tf5-#ll{cC@v<x=)pA
-zDy$p)mTgGTcQYShbM#tm2>#Zlzxi?2cy3&gq6Zb5u5$@@ybycc*-E=i(!PE0#hKFr
-z8YwHA>k$o6chjKaEI)p!$wAoDL&l~=#<s>v-6#Yi*r{$k>ZaBy{Iun&<0smBO`6!}
-z&StP`{nE=0r$xK_71$KwBW33sD3@jYuko!${Lx^a-L6nm0XI6%X29c~=ih=uThmJ)
-z0`sx_;Y1skXCCRh-xeI%QSoNyoC$Wv8mqotx(oQi`#)m+fqj0FJ-Ty#;#jn=BdyK1
-z=B6qP9+rpm>D&te;)?^%P6X=O1hYo3EX2|KvKf!7<in@|#EeC0w2e)Mm}q{`poK|>
-zPC<6`-z>t4dt|sDMs?7$3qAAwp-9+}MG!&<X;8uHJ!}cJinh0cyE~Vi50<$Xx>HJG
-zR56mo+HAFw+XYc`x>Q}qx+NXw8e)#H8!s@v;-0BxXf+|F!d7fF@#<yem8N9j5SC=r
-zc$J(})qZ_E@sRLVqKtdLWG-?<)m``mc;5dsHW7am(_Qrv810mnzqP3UthRm2kuT0W
-z&lrxG68`O#Kj6Puz_+qnz<*@Nbri&j*VHS+Mo-Jb?JMMa0!wm-Z@4q@m?_7ZAll^_
-zDtpS%N^D4F2KKG&^e2?Q+9#gy(T;AXoq>p{m^y-r6hhl}(OBf7Z9T-XA592wChCS0
-zghE0dyCjN1p_XlCgFCZKa-d{)QY(vcG`ORqtd3Hv5q9cwWnZ4q9qQ6`p6gJt`i)yI
-z(k}USCso)D$=T8}kwkOttLQTdyk*$=N_P3zvI>gu=+%v7SlP7u{o}wK&@wvs;D^UR
-z_hrU2=hu&--McI1v8!sb^Bc*rSV@S1`PqwVQ`z9M6-zo`(fwU?)%?V6L!Su!wg%>{
-z_Ap6;JlsqFZs&4*_Hyca%X9UeJIQa&rOjqikQOUh?PJH$+b_COMJv9VKYa@bFkHLx
-zuWLOM>d88zwur244toBqkoynF%QKPCmFw0Ka@mD&l3lo%OWd7hoRh}_r7X*^);1h%
-zS0Y48mAJw~sY5l#+K!_g>SODUoZ~jI`sJw?F0nP#X1q5bEJf^ykCn3huW0t=Buk;;
-z><OG4E{)F{TT=~)r?<_`<?!07($8YgTA!>>ir2ELZoSz5-s3JdcW?Z_9fzG0p9c+f
-zF75u0<G0q%Pgq*{s0DHLH8@=}91=$HB+Fe*zx;<VO;%Lb%Fn7>#&QywaXZ%t%wx<X
-zvUdSJpAPsYepv&92^b?33aQ+l$}%1*we|81iQN&gY;M_x@Tm;K7TZ*q#I*>{@Fph2
-z8>d4#CVRatgPNLgq_yn;r95LtQ=1x5s+u(jzg=bD(&G)*)dYbk>S|JZ)~LT9sTL@u
-zECM`$!UZY^R=O23aIrSC7%KKzN9#j7fanO3;b9=-Uc<wOwcmJ`n?DRl3VQB76FuWr
-ze&-Y}Y7)5DF@p(38BssGAF%vvh1c?>$Ny%r0K&Q!3cxIY#cXS+p(!;!p<8t|C$aCa
-zmw^IV`QJZ}KB=L;O)P3iejKViBr3riXEU+Os6+HSe_FAg!SQwiv}qZC3nHl$W!6d@
-z71S4lX+^(KPOx(Lj8E2YV-Zqp(+-;Kb+HVT)!}eMBgt)5sl=u6D6D9$YloY~9dexu
-zC}LF_6A=VFoaEBm3%6lfCWlPwM@VL+UM3*w1;PDiz0>h(8VN;V8aTNv3fh%t9zvN$
-za|0i22m!`XM6L*S0Fdda0iPRhpr2%l$fxG?zK7#0^aKDQU~07$CjLum<cgsI^Cg?$
-z=^u0uFIa(TpbQ(@x~a@M#9Ht8tlKT~kX&o|rlNu8u|88ZKbw*Z>>`5g?|W04<I|k{
-zf8kS=oj={Y;@8;XOz~=TqntirW=YnyZvFmV#w8iwSj1b+vE)h1aIm71n3cCdS}6x>
-zWxRT@I!WkM-tG!!7x5q%CKh;dv|7?pi#(SC(n*+<A&fO4BC*gpt<Kw#?v@QANIy>0
-z_JkG+)}Xfm$~B~iw*|Y6V&e#09^^HH!`DEm|CuKkbWa-O@fsst>tv+k@8<e0IeRDQ
-z6%&;KwCx)MxzJKg`0~%-=L+cyz|v~xJYkuvQl4L~U`3Vi2q5TUQtYbI7P9j?Srnoc
-zBrcLeuot<fb1!`d3L1ehObS%-LknRP9YwAKlo7Z_N}NOuuOk?7WV8&K90Km+q>)}8
-zIQam3jP;F3e?*@>`tHd{w(%sqW=s(>)3Q9u09{KSdvd$zkgeI0Rru=vq=DF#Oe*oT
-zUC2Jj$)htMM58IqjK$$@6z*jZz{EVhT7wY1hm0T)9FAM%LDXn*5`t*q1rLmYg^^Y_
-z5X0LrM!i=K-<X=iiDNDt@Wv0=Q}2ho$4CwE7JL2`Pa{@kR#MJmqt?c3cFpEb!SRfx
-zJMm8m!h#Wq2wGEHCUm6MHllU%%Ic=_qa3j-1zZ#YL5LrPfOWb>AxOOl={&<{L5D6K
-z>;#ZUjACfyRUVK|-3P;Ja2nW>ZDDm2Hk|&>-$hEgc><W-n8G36B;oVn8+Z@Cd2$CA
-ze9w4E`#J92pw}150idYeiA<k5?!9CX9{*1!Afw-i3_cctKAXI5B~jNaPrTKK7b_A5
-z8uu_GFd~x6@lQ71$LFn<?BLbPFp)E*BsW1;@7%m!QBQAbu2V~jB5PcwwnP{{;2KoO
-zppv?~Ihmq-;Cvsc%*ZFPH~C~Y$qAr<mw-hT;=6l_LC`70-8O=J7mW7vmCd?rz=DQY
-z)LHJeZ=Y~xlivUbwT!#3*W1LsccOK@GL={@|AY`J{cjQwf&SitlwrC&U|`R?P4?+L
-zr)p}C0u>zg*B6Z$SRsZ~{lGhZr?x1$s1t=S{s}LNuEC0*wfD$uoK%8ST1Gwr7S<)+
-z9iovkGf8Q-PRuArxY*w<zSKAaAqKDZq2vZD6jGN+60GWIJ>t@#b!a<6hm>8dJ&H!R
-z$Oeur6$C~7PILzO;&4;G^(D-5m{@KUl(MQ1yKs>x+CMqUUllb3x?=fppG8_v>@)ZY
-xGUUb21Yb|%0k3;EZL|UA71ss70{N2%rZhjU!V8TtZxzYw_F;tVxR3z1e*p+ht@8i?
-
-literal 0
-HcmV?d00001
-
--- 
-2.5.0
-
diff --git a/projects/RPi/patches/kodi/kodi-001-jarvis-rbp-backports.patch b/projects/RPi/patches/kodi/kodi-001-jarvis-rbp-backports.patch
deleted file mode 100644
index fa712e8b6a..0000000000
--- a/projects/RPi/patches/kodi/kodi-001-jarvis-rbp-backports.patch
+++ /dev/null
@@ -1,52162 +0,0 @@
-From d11fabefb909e75e7186bd9ecd0cbff9e8b24577 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Sep 2015 19:11:14 +0100
-Subject: [PATCH 01/93] Enable concealed error frames, but discard them when
- returned
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 17 ++++++++---------
- 1 file changed, 8 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index bebe136..727a9ea 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -255,11 +255,14 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-     if (buffer->length > 0)
-     {
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-+      CMMALVideoBuffer *omvb = NULL;
-+      if (!g_advancedSettings.m_omxDecodeStartWithValidFrame || !(buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+        omvb = new CMMALVideoBuffer(this);
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
-+          CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-+      if (omvb)
-       {
--        CMMALVideoBuffer *omvb = new CMMALVideoBuffer(this);
--        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--          CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
--            CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-         omvb->mmal_buffer = buffer;
-         buffer->user_data = (void *)omvb;
-         omvb->width = m_decoded_width;
-@@ -521,7 +524,6 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   m_hints = hints;
-   m_vout_input_pool = (MMAL_POOL_T *)options.m_opaque_pointer;
-   MMAL_STATUS_T status;
--  MMAL_PARAMETER_BOOLEAN_T error_concealment;
- 
-   m_decoded_width  = hints.width;
-   m_decoded_height = hints.height;
-@@ -630,10 +632,7 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   }
-   m_dec_input->format->flags |= MMAL_ES_FORMAT_FLAG_FRAMED;
- 
--  error_concealment.hdr.id = MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT;
--  error_concealment.hdr.size = sizeof(MMAL_PARAMETER_BOOLEAN_T);
--  error_concealment.enable = g_advancedSettings.m_omxDecodeStartWithValidFrame;
--  status = mmal_port_parameter_set(m_dec_input, &error_concealment.hdr);
-+  status = mmal_port_parameter_set_boolean(m_dec_input, MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT, MMAL_FALSE);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to disable error concealment on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-
-From 5fb2a476f902f028de46e46863fdc74b4c021371 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 26 Aug 2015 21:47:41 +0100
-Subject: [PATCH 02/93] Reduce framerate of high framerate videos when not
- running fullscreen
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 14 +++++++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 15 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 727a9ea..8211e94 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -126,6 +126,8 @@ CMMALVideo::CMMALVideo()
-   m_es_format = mmal_format_alloc();
-   m_preroll = true;
-   m_speed = DVD_PLAYSPEED_NORMAL;
-+  m_fps = 0.0f;
-+  m_num_decoded = 0;
- }
- 
- CMMALVideo::~CMMALVideo()
-@@ -256,8 +258,15 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-     {
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-       CMMALVideoBuffer *omvb = NULL;
--      if (!g_advancedSettings.m_omxDecodeStartWithValidFrame || !(buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+      bool wanted = true;
-+      // we don't keep up when running at 60fps in the background so switch to half rate
-+      if (m_fps > 40.0f && !g_graphicsContext.IsFullScreenVideo() && !(m_num_decoded & 1))
-+        wanted = false;
-+      if (g_advancedSettings.m_omxDecodeStartWithValidFrame && (buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+        wanted = false;
-+      if (wanted)
-         omvb = new CMMALVideoBuffer(this);
-+      m_num_decoded++;
-       if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-         CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
-           CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-@@ -629,7 +638,10 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   {
-     m_dec_input->format->es->video.frame_rate.num = hints.fpsrate;
-     m_dec_input->format->es->video.frame_rate.den = hints.fpsscale;
-+    m_fps = hints.fpsrate / hints.fpsscale;
-   }
-+  else
-+    m_fps = 0.0f;
-   m_dec_input->format->flags |= MMAL_ES_FORMAT_FLAG_FRAMED;
- 
-   status = mmal_port_parameter_set_boolean(m_dec_input, MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT, MMAL_FALSE);
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index d081b9c..0ea6ecd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -123,6 +123,8 @@ protected:
-   bool SendCodecConfigData();
- 
-   CDVDStreamInfo    m_hints;
-+  float             m_fps;
-+  unsigned          m_num_decoded;
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
-
-From 8f815de22d00759496cd60139fb497d4064002cf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 20:08:05 +0000
-Subject: [PATCH 03/93] Remove preroll
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 12 ++----------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  1 -
- 2 files changed, 2 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 8211e94..8468db9 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -124,7 +124,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_demux_queue_length = 0;
-   m_es_format = mmal_format_alloc();
--  m_preroll = true;
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-   m_num_decoded = 0;
-@@ -718,7 +717,6 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-     return false;
- 
-   Prime();
--  m_preroll = !m_hints.stills;
-   m_speed = DVD_PLAYSPEED_NORMAL;
- 
-   return true;
-@@ -874,13 +872,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
-   if (mmal_queue_length(m_dec_input_pool->queue) > 0 && !m_demux_queue_length && queued <= DVD_MSEC_TO_TIME(1000))
-     ret |= VC_BUFFER;
--  else
--    m_preroll = false;
--
--  if (m_preroll && m_output_ready.size() >= GetAllowedReferences())
--    m_preroll = false;
- 
--  if (!m_output_ready.empty() && !m_preroll)
-+  if (!m_output_ready.empty())
-   {
-     ret |= VC_PICTURE;
-   }
-@@ -888,7 +881,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     Sleep(10); // otherwise we busy spin
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f) preroll(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6, m_preroll);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6);
- 
-   return ret;
- }
-@@ -958,7 +951,6 @@ void CMMALVideo::Reset(void)
-   }
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
--  m_preroll = !m_hints.stills && (m_speed == DVD_PLAYSPEED_NORMAL || m_speed == DVD_PLAYSPEED_PAUSE);
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 0ea6ecd..50ac0e3 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -131,7 +131,6 @@ protected:
-   double            m_demuxerPts;
-   double            m_decoderPts;
-   int               m_speed;
--  bool              m_preroll;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From 18c08e9c0410f43d2deec9d69e64eca7fdfd9a17 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 22:35:11 +0000
-Subject: [PATCH 04/93] Remove demux queue
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 81 ++++------------------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   | 10 ---
- 2 files changed, 12 insertions(+), 79 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 8468db9..61ae7e7 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -122,7 +122,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_codingType = 0;
- 
--  m_demux_queue_length = 0;
-   m_es_format = mmal_format_alloc();
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-@@ -742,55 +741,13 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   //  CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d dts:%.3f pts:%.3f ready_queue(%d)",
-   //    CLASSNAME, __func__, pData, iSize, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, m_output_ready.size());
- 
--  unsigned int demuxer_bytes = 0;
--  uint8_t *demuxer_content = NULL;
-   MMAL_BUFFER_HEADER_T *buffer;
-   MMAL_STATUS_T status;
- 
-   Prime();
--  // we need to queue then de-queue the demux packet, seems silly but
--  // mmal might not have an input buffer available when we are called
--  // and we must store the demuxer packet and try again later.
--  // try to send any/all demux packets to mmal decoder.
--  unsigned space = mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size;
--  if (pData && m_demux_queue.empty() && space >= (unsigned int)iSize)
--  {
--    demuxer_bytes = iSize;
--    demuxer_content = pData;
--  }
--  else if (pData && iSize)
--  {
--    mmal_demux_packet demux_packet;
--    demux_packet.dts = dts;
--    demux_packet.pts = pts;
--    demux_packet.size = iSize;
--    demux_packet.buff = new uint8_t[iSize];
--    memcpy(demux_packet.buff, pData, iSize);
--    m_demux_queue_length += demux_packet.size;
--    m_demux_queue.push(demux_packet);
--  }
--
--  uint8_t *buffer_to_free = NULL;
--
-   while (1)
-   {
--     space = mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size;
--     if (!demuxer_bytes && !m_demux_queue.empty())
--     {
--       mmal_demux_packet &demux_packet = m_demux_queue.front();
--       if (space >= (unsigned int)demux_packet.size)
--       {
--         // need to lock here to retrieve an input buffer and pop the queue
--         m_demux_queue_length -= demux_packet.size;
--         m_demux_queue.pop();
--         demuxer_bytes = (unsigned int)demux_packet.size;
--         demuxer_content = demux_packet.buff;
--         buffer_to_free = demux_packet.buff;
--         dts = demux_packet.dts;
--         pts = demux_packet.pts;
--       }
--     }
--     if (demuxer_content)
-+     if (pData)
-      {
-        // 500ms timeout
-        buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-@@ -805,20 +762,20 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-        buffer->pts = pts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : pts;
-        buffer->dts = dts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : dts;
-        if (m_hints.ptsinvalid) buffer->pts = MMAL_TIME_UNKNOWN;
--       buffer->length = demuxer_bytes > buffer->alloc_size ? buffer->alloc_size : demuxer_bytes;
-+       buffer->length = (uint32_t)iSize > buffer->alloc_size ? buffer->alloc_size : (uint32_t)iSize;
-        // set a flag so we can identify primary frames from generated frames (deinterlace)
-        buffer->flags = MMAL_BUFFER_HEADER_FLAG_USER0;
- 
--       memcpy(buffer->data, demuxer_content, buffer->length);
--       demuxer_bytes   -= buffer->length;
--       demuxer_content += buffer->length;
-+       memcpy(buffer->data, pData, buffer->length);
-+       iSize -= buffer->length;
-+       pData += buffer->length;
- 
--       if (demuxer_bytes == 0)
-+       if (iSize == 0)
-          buffer->flags |= MMAL_BUFFER_HEADER_FLAG_FRAME_END;
- 
-        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--         CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d/%-6d dts:%.3f pts:%.3f flags:%x ready_queue(%d) demux_queue(%d) space(%d)",
--            CLASSNAME, __func__, buffer, buffer->length, demuxer_bytes, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, buffer->flags, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size);
-+         CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d/%-6d dts:%.3f pts:%.3f flags:%x ready_queue(%d)",
-+            CLASSNAME, __func__, buffer, buffer->length, iSize, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, buffer->flags, m_output_ready.size());
-        assert((int)buffer->length > 0);
-        status = mmal_port_send_buffer(m_dec_input, buffer);
-        if (status != MMAL_SUCCESS)
-@@ -827,7 +784,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-          return VC_ERROR;
-        }
- 
--       if (demuxer_bytes == 0)
-+       if (iSize == 0)
-        {
-          EDEINTERLACEMODE deinterlace_request = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode;
-          EINTERLACEMETHOD interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-@@ -851,17 +808,9 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-            DestroyDeinterlace();
-          if (deinterlace && !m_deint)
-            CreateDeinterlace(interlace_method);
--
--         if (buffer_to_free)
--         {
--           delete [] buffer_to_free;
--           buffer_to_free = NULL;
--           demuxer_content = NULL;
--           continue;
--         }
-        }
-     }
--    if (!demuxer_bytes)
-+    if (!iSize)
-       break;
-   }
-   int ret = 0;
-@@ -870,7 +819,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   else if (dts != DVD_NOPTS_VALUE)
-     m_demuxerPts = dts;
-   double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
--  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && !m_demux_queue_length && queued <= DVD_MSEC_TO_TIME(1000))
-+  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && queued <= DVD_MSEC_TO_TIME(1000))
-     ret |= VC_BUFFER;
- 
-   if (!m_output_ready.empty())
-@@ -881,7 +830,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     Sleep(10); // otherwise we busy spin
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6);
- 
-   return ret;
- }
-@@ -938,12 +887,6 @@ void CMMALVideo::Reset(void)
-       break;
-   }
- 
--  pthread_mutex_lock(&m_output_mutex);
--  while (!m_demux_queue.empty())
--    m_demux_queue.pop();
--  m_demux_queue_length = 0;
--  pthread_mutex_unlock(&m_output_mutex);
--
-   if (!m_finished)
-   {
-     SendCodecConfigData();
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 50ac0e3..f4df09c 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -64,13 +64,6 @@ private:
- 
- class CMMALVideo : public CDVDVideoCodec
- {
--  typedef struct mmal_demux_packet {
--    uint8_t *buff;
--    int size;
--    double dts;
--    double pts;
--  } mmal_demux_packet;
--
- public:
-   CMMALVideo();
-   virtual ~CMMALVideo();
-@@ -110,9 +103,6 @@ protected:
-   float             m_aspect_ratio;
-   const char        *m_pFormatName;
- 
--  std::queue<mmal_demux_packet> m_demux_queue;
--  unsigned           m_demux_queue_length;
--
-   // mmal output buffers (video frames)
-   pthread_mutex_t   m_output_mutex;
-   std::queue<CMMALVideoBuffer*> m_output_ready;
-
-From 432994f3a9e9867d04d4c3d360476d72acea0a6c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 20:10:33 +0000
-Subject: [PATCH 05/93] Remove time based limit on submitted packets
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 24 +++-------------------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 --
- 2 files changed, 3 insertions(+), 23 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 61ae7e7..1674fdd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -108,8 +108,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_interlace_mode = MMAL_InterlaceProgressive;
-   m_interlace_method = VS_INTERLACEMETHOD_NONE;
--  m_decoderPts = DVD_NOPTS_VALUE;
--  m_demuxerPts = DVD_NOPTS_VALUE;
- 
-   m_dec = NULL;
-   m_dec_input = NULL;
-@@ -814,23 +812,14 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-       break;
-   }
-   int ret = 0;
--  if (pts != DVD_NOPTS_VALUE)
--    m_demuxerPts = pts;
--  else if (dts != DVD_NOPTS_VALUE)
--    m_demuxerPts = dts;
--  double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
--  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && queued <= DVD_MSEC_TO_TIME(1000))
--    ret |= VC_BUFFER;
- 
-   if (!m_output_ready.empty())
--  {
-     ret |= VC_PICTURE;
--  }
--  if (!ret)
--    Sleep(10); // otherwise we busy spin
-+  else
-+    ret |= VC_BUFFER;
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d)", CLASSNAME, __func__, ret, m_output_ready.size());
- 
-   return ret;
- }
-@@ -892,8 +881,6 @@ void CMMALVideo::Reset(void)
-     SendCodecConfigData();
-     Prime();
-   }
--  m_decoderPts = DVD_NOPTS_VALUE;
--  m_demuxerPts = DVD_NOPTS_VALUE;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -982,11 +969,6 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
-     return false;
-   }
- 
--  if (pDvdVideoPicture->pts != DVD_NOPTS_VALUE)
--    m_decoderPts = pDvdVideoPicture->pts;
--  else
--    m_decoderPts = pDvdVideoPicture->dts;
--
-   return true;
- }
- 
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index f4df09c..8f84557 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -118,8 +118,6 @@ protected:
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
--  double            m_demuxerPts;
--  double            m_decoderPts;
-   int               m_speed;
- 
-   CCriticalSection m_sharedSection;
-
-From 14ec8859335b4dc5add80bed34ce21ab3a4c8df4 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Dec 2015 22:18:47 +0000
-Subject: [PATCH 06/93] Add back logging of data queued in decoder
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 22 +++++++++++++++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 23 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 1674fdd..35a9847 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -108,6 +108,8 @@ CMMALVideo::CMMALVideo()
- 
-   m_interlace_mode = MMAL_InterlaceProgressive;
-   m_interlace_method = VS_INTERLACEMETHOD_NONE;
-+  m_decoderPts = DVD_NOPTS_VALUE;
-+  m_demuxerPts = DVD_NOPTS_VALUE;
- 
-   m_dec = NULL;
-   m_dec_input = NULL;
-@@ -252,6 +254,11 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-   {
-     if (buffer->length > 0)
-     {
-+      if (buffer->pts != MMAL_TIME_UNKNOWN)
-+        m_decoderPts = buffer->pts;
-+      else if (buffer->dts != MMAL_TIME_UNKNOWN)
-+        m_decoderPts = buffer->dts;
-+
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-       CMMALVideoBuffer *omvb = NULL;
-       bool wanted = true;
-@@ -811,6 +818,17 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     if (!iSize)
-       break;
-   }
-+  if (pts != DVD_NOPTS_VALUE)
-+    m_demuxerPts = pts;
-+  else if (dts != DVD_NOPTS_VALUE)
-+    m_demuxerPts = dts;
-+
-+  if (m_demuxerPts != DVD_NOPTS_VALUE && m_decoderPts == DVD_NOPTS_VALUE)
-+    m_decoderPts = m_demuxerPts;
-+
-+  // we've built up quite a lot of data in decoder - try to throttle it
-+  double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
-+  bool full = queued > DVD_MSEC_TO_TIME(1000);
-   int ret = 0;
- 
-   if (!m_output_ready.empty())
-@@ -819,7 +837,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     ret |= VC_BUFFER;
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d)", CLASSNAME, __func__, ret, m_output_ready.size());
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
- 
-   return ret;
- }
-@@ -881,6 +899,8 @@ void CMMALVideo::Reset(void)
-     SendCodecConfigData();
-     Prime();
-   }
-+  m_decoderPts = DVD_NOPTS_VALUE;
-+  m_demuxerPts = DVD_NOPTS_VALUE;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 8f84557..f4df09c 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -118,6 +118,8 @@ protected:
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
-+  double            m_demuxerPts;
-+  double            m_decoderPts;
-   int               m_speed;
- 
-   CCriticalSection m_sharedSection;
-
-From 61928feb51d23e4550abfbf8ab26e933ff1fec4e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 8 Dec 2015 11:40:17 +0000
-Subject: [PATCH 07/93] Try to minimise latency through hardware decoder. This
- could reduce performance but keeps videoplayer happier
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 42 +++++++++++++++++-----
- 1 file changed, 33 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 35a9847..f96cc14 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -659,14 +659,21 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to disable interpolate timestamps mode on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-+  // limit number of callback structures in video_decode to reduce latency. Too low and video hangs.
-+  // negative numbers have special meaning. -1=size of DPB -2=size of DPB+1
-+  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -3);
-+  if (status != MMAL_SUCCESS)
-+    CLog::Log(LOGERROR, "%s::%s Failed to configure max num callbacks on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
-+
-   status = mmal_port_format_commit(m_dec_input);
-   if (status != MMAL_SUCCESS)
-   {
-     CLog::Log(LOGERROR, "%s::%s Failed to commit format for decoder input port %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
-     return false;
-   }
--  m_dec_input->buffer_size = m_dec_input->buffer_size_recommended;
--  m_dec_input->buffer_num = m_dec_input->buffer_num_recommended;
-+  // use a small number of large buffers to keep latency under control
-+  m_dec_input->buffer_size = 1024*1024;
-+  m_dec_input->buffer_num = 2;
- 
-   m_dec_input->userdata = (struct MMAL_PORT_USERDATA_T *)this;
-   status = mmal_port_enable(m_dec_input, dec_input_port_cb_static);
-@@ -755,13 +762,15 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-      if (pData)
-      {
-        // 500ms timeout
--       buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
--       if (!buffer)
-        {
--         CLog::Log(LOGERROR, "%s::%s - mmal_queue_get failed", CLASSNAME, __func__);
--         return VC_ERROR;
-+         CSingleExit unlock(m_sharedSection);
-+         buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-+         if (!buffer)
-+         {
-+           CLog::Log(LOGERROR, "%s::%s - mmal_queue_get failed", CLASSNAME, __func__);
-+           return VC_ERROR;
-+         }
-        }
--
-        mmal_buffer_header_reset(buffer);
-        buffer->cmd = 0;
-        buffer->pts = pts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : pts;
-@@ -833,11 +842,26 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
- 
-   if (!m_output_ready.empty())
-     ret |= VC_PICTURE;
--  else
-+  if (mmal_queue_length(m_dec_input_pool->queue) > 0)
-     ret |= VC_BUFFER;
- 
-+  bool slept = false;
-+  if (!ret)
-+  {
-+    slept = true;
-+    {
-+      // otherwise we busy spin
-+      CSingleExit unlock(m_sharedSection);
-+      Sleep(10);
-+    }
-+    if (!m_output_ready.empty())
-+      ret |= VC_PICTURE;
-+    if (mmal_queue_length(m_dec_input_pool->queue) > 0)
-+      ret |= VC_BUFFER;
-+  }
-+
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
- 
-   return ret;
- }
-
-From 0d9c905db96e1b465a26c834430a1783c000a5a9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 13 Jul 2015 19:27:25 +0100
-Subject: [PATCH 08/93] Enable QPU based deinterlace and remove resolution
- limit
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp         | 2 +-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 5 ++---
- xbmc/cores/omxplayer/OMXVideo.cpp                  | 6 +++---
- 3 files changed, 6 insertions(+), 7 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 2941d34..bee3af1 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -679,7 +679,7 @@ bool CMMALRenderer::Supports(ESCALINGMETHOD method)
- 
- EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- {
--  return VS_INTERLACEMETHOD_MMAL_ADVANCED;
-+  return m_sourceWidth * m_sourceHeight <= 576 * 720 ? VS_INTERLACEMETHOD_MMAL_ADVANCED : VS_INTERLACEMETHOD_MMAL_BOB;
- }
- 
- void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index f96cc14..0dda9ad 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -363,12 +363,11 @@ bool CMMALVideo::CreateDeinterlace(EINTERLACEMETHOD interlace_method)
-     CLog::Log(LOGERROR, "%s::%s Failed to create deinterlace component (status=%x %s)", CLASSNAME, __func__, status, mmal_status_to_string(status));
-     return false;
-   }
--  bool advanced_deinterlace = (interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF) &&
--      m_decoded_width * m_decoded_height <= 576 * 720;
-+  bool advanced_deinterlace = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF;
-   bool half_framerate = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF || interlace_method == VS_INTERLACEMETHOD_MMAL_BOB_HALF;
- 
-   MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {{MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param)},
--        advanced_deinterlace ? MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV : MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST, 3, {3, 0, half_framerate }};
-+        advanced_deinterlace ? MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV : MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST, 4, {3, 0, half_framerate, 1 }};
- 
-   status = mmal_port_parameter_set(m_deint->output[0], &imfx_param.hdr);
-   if (status != MMAL_SUCCESS)
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index e50c13a..20ad4fa 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -253,8 +253,7 @@ bool COMXVideo::PortSettingsChanged()
-   if(m_deinterlace)
-   {
-     EINTERLACEMETHOD interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
--    bool advanced_deinterlace = (interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF) &&
--        port_image.format.video.nFrameWidth * port_image.format.video.nFrameHeight <= 576 * 720;
-+    bool advanced_deinterlace = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF;
-     bool half_framerate = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF || interlace_method == VS_INTERLACEMETHOD_MMAL_BOB_HALF;
-     if (!advanced_deinterlace)
-     {
-@@ -275,10 +274,11 @@ bool COMXVideo::PortSettingsChanged()
-     OMX_INIT_STRUCTURE(image_filter);
- 
-     image_filter.nPortIndex = m_omx_image_fx.GetOutputPort();
--    image_filter.nNumParams = 3;
-+    image_filter.nNumParams = 4;
-     image_filter.nParams[0] = 3;
-     image_filter.nParams[1] = 0;
-     image_filter.nParams[2] = half_framerate;
-+    image_filter.nParams[3] = 1; // qpu
-     if (!advanced_deinterlace)
-       image_filter.eImageFilter = OMX_ImageFilterDeInterlaceFast;
-     else
-
-From 6cfe8e3a2fa86dbb63830eea0b1f9617ea6c9ba0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 16 Aug 2015 15:46:33 +0100
-Subject: [PATCH 09/93] Allow deinterlace with software decode
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 9 +++++++++
- 1 file changed, 9 insertions(+)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index bee3af1..9b5c666 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -646,6 +646,13 @@ bool CMMALRenderer::Supports(EDEINTERLACEMODE mode)
- 
- bool CMMALRenderer::Supports(EINTERLACEMETHOD method)
- {
-+  if (m_format == RENDER_FMT_YUV420P)
-+  {
-+    if (method == VS_INTERLACEMETHOD_DEINTERLACE_HALF)
-+      return true;
-+    else
-+      return false;
-+  }
-   if (method == VS_INTERLACEMETHOD_AUTO)
-     return true;
-   if (method == VS_INTERLACEMETHOD_MMAL_ADVANCED)
-@@ -679,6 +686,8 @@ bool CMMALRenderer::Supports(ESCALINGMETHOD method)
- 
- EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- {
-+  if (m_format == RENDER_FMT_YUV420P)
-+    return VS_INTERLACEMETHOD_DEINTERLACE_HALF;
-   return m_sourceWidth * m_sourceHeight <= 576 * 720 ? VS_INTERLACEMETHOD_MMAL_ADVANCED : VS_INTERLACEMETHOD_MMAL_BOB;
- }
- 
-
-From d5c49bf267a9dd4baf7e6be9127548adf64d899b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 15 Sep 2015 22:26:26 +0100
-Subject: [PATCH 10/93] omxplayer: Don't use AutoInterlaceMethod it changes at
- start of file
-
----
- xbmc/cores/omxplayer/OMXHelper.cpp | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index de493a2..7251fc1 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -130,19 +130,19 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     bool audio_fifo_low = false, video_fifo_low = false, audio_fifo_high = false, video_fifo_high = false;
- 
-     if (m_OmxPlayerState.interlace_method == VS_INTERLACEMETHOD_MAX)
--      m_OmxPlayerState.interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-+      m_OmxPlayerState.interlace_method = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
- 
-     // if deinterlace setting has changed, we should close and open video
-     if (m_OmxPlayerState.current_deinterlace != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode ||
-        (m_OmxPlayerState.current_deinterlace != VS_DEINTERLACEMODE_OFF &&
--        m_OmxPlayerState.interlace_method != g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod)))
-+        m_OmxPlayerState.interlace_method != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod))
-     {
-       CLog::Log(LOGNOTICE, "%s - Reopen stream due to interlace change (%d,%d,%d,%d)", __FUNCTION__,
-         m_OmxPlayerState.current_deinterlace, CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode,
--        m_OmxPlayerState.interlace_method, g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod));
-+        m_OmxPlayerState.interlace_method, CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
- 
-       m_OmxPlayerState.current_deinterlace = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode;
--      m_OmxPlayerState.interlace_method    = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-+      m_OmxPlayerState.interlace_method    = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
-       reopen_stream = true;
-     }
- 
-
-From cb890fdeed45ff016c15f321d00f6cfe9cc3685d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Sep 2015 15:05:37 +0100
-Subject: [PATCH 11/93] Avoid calling render callback with the lock held to
- avoid a deadlock
-
----
- xbmc/cores/omxplayer/OMXVideo.cpp | 27 ++++++++++++++-------------
- xbmc/cores/omxplayer/OMXVideo.h   | 10 +++++++++-
- 2 files changed, 23 insertions(+), 14 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index 20ad4fa..eb13e6f 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -141,9 +141,8 @@ bool COMXVideo::NaluFormatStartCodes(enum AVCodecID codec, uint8_t *in_extradata
-   return false;    
- }
- 
--bool COMXVideo::PortSettingsChanged()
-+bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
- {
--  CSingleLock lock (m_critSection);
-   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
- 
-   if (m_settings_changed)
-@@ -186,15 +185,13 @@ bool COMXVideo::PortSettingsChanged()
-       port_image.format.video.xFramerate / (float)(1<<16), interlace.eMode, m_deinterlace);
- 
-   // let OMXPlayerVideo know about resolution so it can inform RenderManager
--  if (m_res_callback)
--  {
--    float display_aspect = 0.0f;
--    if (pixel_aspect.nX && pixel_aspect.nY)
--      display_aspect = (float)pixel_aspect.nX * port_image.format.video.nFrameWidth /
--        ((float)pixel_aspect.nY * port_image.format.video.nFrameHeight);
--    m_res_callback(m_res_ctx, port_image.format.video.nFrameWidth, port_image.format.video.nFrameHeight,
--        port_image.format.video.xFramerate / (float)(1<<16), display_aspect);
--  }
-+  resinfo.width = port_image.format.video.nFrameWidth;
-+  resinfo.height = port_image.format.video.nFrameHeight;
-+  resinfo.framerate = port_image.format.video.xFramerate / (float)(1<<16);
-+  resinfo.display_aspect = 0.0f;
-+  resinfo.changed = true;
-+  if (pixel_aspect.nX && pixel_aspect.nY)
-+    resinfo.display_aspect = (float)pixel_aspect.nX * port_image.format.video.nFrameWidth / ((float)pixel_aspect.nY * port_image.format.video.nFrameHeight);
- 
-   if (m_settings_changed)
-   {
-@@ -802,10 +799,11 @@ int COMXVideo::Decode(uint8_t *pData, int iSize, double dts, double pts)
-       }
-       //CLog::Log(LOGINFO, "VideD: dts:%.0f pts:%.0f size:%d)\n", dts, pts, iSize);
- 
-+      ResolutionUpdateInfo resinfo = {};
-       omx_err = m_omx_decoder.WaitForEvent(OMX_EventPortSettingsChanged, 0);
-       if (omx_err == OMX_ErrorNone)
-       {
--        if(!PortSettingsChanged())
-+        if(!PortSettingsChanged(resinfo))
-         {
-           CLog::Log(LOGERROR, "%s::%s - error PortSettingsChanged omx_err(0x%08x)\n", CLASSNAME, __func__, omx_err);
-           return false;
-@@ -814,11 +812,14 @@ int COMXVideo::Decode(uint8_t *pData, int iSize, double dts, double pts)
-       omx_err = m_omx_decoder.WaitForEvent(OMX_EventParamOrConfigChanged, 0);
-       if (omx_err == OMX_ErrorNone)
-       {
--        if(!PortSettingsChanged())
-+        if(!PortSettingsChanged(resinfo))
-         {
-           CLog::Log(LOGERROR, "%s::%s - error PortSettingsChanged (EventParamOrConfigChanged) omx_err(0x%08x)\n", CLASSNAME, __func__, omx_err);
-         }
-       }
-+      lock.Leave();
-+      if (resinfo.changed && m_res_callback)
-+        m_res_callback(m_res_ctx, resinfo.width, resinfo.height, resinfo.framerate, resinfo.display_aspect);
-     }
-     return true;
- 
-diff --git a/xbmc/cores/omxplayer/OMXVideo.h b/xbmc/cores/omxplayer/OMXVideo.h
-index d0634bb..7baefa5 100644
---- a/xbmc/cores/omxplayer/OMXVideo.h
-+++ b/xbmc/cores/omxplayer/OMXVideo.h
-@@ -41,6 +41,14 @@
- 
- typedef void (*ResolutionUpdateCallBackFn)(void *ctx, uint32_t width, uint32_t height, float framerate, float display_aspect);
- 
-+struct ResolutionUpdateInfo {
-+  uint32_t width;
-+  uint32_t height;
-+  float framerate;
-+  float display_aspect;
-+  bool changed;
-+};
-+
- class COMXVideo
- {
- public:
-@@ -50,7 +58,7 @@ public:
-   // Required overrides
-   bool SendDecoderConfig();
-   bool Open(CDVDStreamInfo &hints, OMXClock *clock, EDEINTERLACEMODE deinterlace = VS_DEINTERLACEMODE_OFF, bool hdmi_clock_sync = false);
--  bool PortSettingsChanged();
-+  bool PortSettingsChanged(ResolutionUpdateInfo &resinfo);
-   void RegisterResolutionUpdateCallBack(void *ctx, ResolutionUpdateCallBackFn callback) { m_res_ctx = ctx; m_res_callback = callback; }
-   void Close(void);
-   unsigned int GetFreeSpace();
-
-From 364da740e395d2091293f521a4bde7806b3218a0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Dec 2015 22:03:56 +0000
-Subject: [PATCH 12/93] Add settings option to enable MVC and frame packing
- support
-
----
- .../resource.language.en_gb/resources/strings.po   | 22 ++++++++++++++++++++++
- system/settings/rbp.xml                            | 14 ++++++++++++++
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp |  5 +++++
- xbmc/cores/omxplayer/OMXVideo.cpp                  |  5 +++++
- xbmc/settings/Settings.cpp                         |  2 ++
- xbmc/settings/Settings.h                           |  2 ++
- 6 files changed, 50 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index a697a61..01173ca 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18110,3 +18110,25 @@ msgstr ""
- msgctxt "#38023"
- msgid "Set my rating"
- msgstr ""
-+
-+#: system/settings/rbp.xml
-+msgctxt "#38027"
-+msgid "Decode the stereo stream from 3D files"
-+msgstr ""
-+
-+#. Description of setting "Decode the stereo stream from 3D files" with label #38027
-+#: system/settings/rbp.xml
-+msgctxt "#38028"
-+msgid "If enabled, videos created in Multiview Video Coding (MVC) format can also be watched in stereoscopic 3D. MVC format is typically found on 3D Blu-rays.[CR]Note: Processing of this data may reduce playback performance, so only enable if you require stereoscopic 3D support."
-+msgstr ""
-+
-+#: system/settings/rbp.xml
-+msgctxt "#38029"
-+msgid "Enable Full HD HDMI modes for stereoscopic 3D"
-+msgstr ""
-+
-+#. Description of setting "Enable Full HD HDMI modes for stereoscopic 3D" with label #38029
-+#: system/settings/rbp.xml
-+msgctxt "#38030"
-+msgid "This option uses frame-packing to output full resolution for 3D through HDMI.[CR]Enabling this improves quality of Multiview Video Coding (MVC) videos, but may not be supported by all displays."
-+msgstr ""
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 50fe36a..7a170c2 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -27,6 +27,13 @@
-           <control type="edit" format="integer" />
-         </setting>
-       </group>
-+      <group id="3">
-+        <setting id="videoplayer.supportmvc" type="boolean" label="38027" help="38028">
-+          <level>2</level>
-+          <default>true</default>
-+          <control type="toggle" />
-+        </setting>
-+      </group>
-     </category>
-     <category id="myvideos">
-       <group id="1">
-@@ -70,6 +77,13 @@
-           <control type="edit" format="integer" />
-         </setting>
-       </group>
-+      <group id="5">
-+        <setting id="videoscreen.framepacking" type="boolean" label="38029" help="38030">
-+          <level>2</level>
-+          <default>false</default>
-+          <control type="toggle" />
-+        </setting>
-+      </group>
-     </category>
-     <category id="audiooutput">
-       <group id="1">
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 0dda9ad..c09074d 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -551,6 +551,11 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-       // H.264
-       m_codingType = MMAL_ENCODING_H264;
-       m_pFormatName = "mmal-h264";
-+      if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC))
-+      {
-+        m_codingType = MMAL_ENCODING_MVC;
-+        m_pFormatName= "mmal-mvc";
-+      }
-     break;
-     case AV_CODEC_ID_H263:
-     case AV_CODEC_ID_MPEG4:
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index eb13e6f..ea8c0fc 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -427,6 +427,11 @@ bool COMXVideo::Open(CDVDStreamInfo &hints, OMXClock *clock, EDEINTERLACEMODE de
-           break;
-       }
-     }
-+    if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC))
-+    {
-+      m_codingType = OMX_VIDEO_CodingMVC;
-+      m_video_codec_name = "omx-mvc";
-+    }
-     break;
-     case AV_CODEC_ID_MPEG4:
-       // (role name) video_decoder.mpeg4
-diff --git a/xbmc/settings/Settings.cpp b/xbmc/settings/Settings.cpp
-index f50355b..5035cec 100644
---- a/xbmc/settings/Settings.cpp
-+++ b/xbmc/settings/Settings.cpp
-@@ -181,6 +181,7 @@ const std::string CSettings::SETTING_VIDEOPLAYER_USEVDA = "videoplayer.usevda";
- const std::string CSettings::SETTING_VIDEOPLAYER_USEMMAL = "videoplayer.usemmal";
- const std::string CSettings::SETTING_VIDEOPLAYER_USESTAGEFRIGHT = "videoplayer.usestagefright";
- const std::string CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE = "videoplayer.limitguiupdate";
-+const std::string CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC = "videoplayer.supportmvc";
- const std::string CSettings::SETTING_MYVIDEOS_SELECTACTION = "myvideos.selectaction";
- const std::string CSettings::SETTING_MYVIDEOS_EXTRACTFLAGS = "myvideos.extractflags";
- const std::string CSettings::SETTING_MYVIDEOS_EXTRACTCHAPTERTHUMBS = "myvideos.extractchapterthumbs";
-@@ -344,6 +345,7 @@ const std::string CSettings::SETTING_VIDEOSCREEN_VSYNC = "videoscreen.vsync";
- const std::string CSettings::SETTING_VIDEOSCREEN_GUICALIBRATION = "videoscreen.guicalibration";
- const std::string CSettings::SETTING_VIDEOSCREEN_TESTPATTERN = "videoscreen.testpattern";
- const std::string CSettings::SETTING_VIDEOSCREEN_LIMITEDRANGE = "videoscreen.limitedrange";
-+const std::string CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING = "videoscreen.framepacking";
- const std::string CSettings::SETTING_AUDIOOUTPUT_AUDIODEVICE = "audiooutput.audiodevice";
- const std::string CSettings::SETTING_AUDIOOUTPUT_CHANNELS = "audiooutput.channels";
- const std::string CSettings::SETTING_AUDIOOUTPUT_CONFIG = "audiooutput.config";
-diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h
-index 55e150d..f3ba426 100644
---- a/xbmc/settings/Settings.h
-+++ b/xbmc/settings/Settings.h
-@@ -137,6 +137,7 @@ public:
-   static const std::string SETTING_VIDEOPLAYER_USEMMAL;
-   static const std::string SETTING_VIDEOPLAYER_USESTAGEFRIGHT;
-   static const std::string SETTING_VIDEOPLAYER_LIMITGUIUPDATE;
-+  static const std::string SETTING_VIDEOPLAYER_SUPPORTMVC;
-   static const std::string SETTING_MYVIDEOS_SELECTACTION;
-   static const std::string SETTING_MYVIDEOS_EXTRACTFLAGS;
-   static const std::string SETTING_MYVIDEOS_EXTRACTCHAPTERTHUMBS;
-@@ -300,6 +301,7 @@ public:
-   static const std::string SETTING_VIDEOSCREEN_GUICALIBRATION;
-   static const std::string SETTING_VIDEOSCREEN_TESTPATTERN;
-   static const std::string SETTING_VIDEOSCREEN_LIMITEDRANGE;
-+  static const std::string SETTING_VIDEOSCREEN_FRAMEPACKING;
-   static const std::string SETTING_AUDIOOUTPUT_AUDIODEVICE;
-   static const std::string SETTING_AUDIOOUTPUT_CHANNELS;
-   static const std::string SETTING_AUDIOOUTPUT_CONFIG;
-
-From 71d3daeb3f44c6a7876415141e740464ce8b6c87 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 17 Dec 2015 15:38:34 +0000
-Subject: [PATCH 13/93] Don't adjust 3d rectangles in bypass mode
-
----
- xbmc/cores/VideoRenderers/BaseRenderer.cpp | 55 ++++++++++++++++--------------
- 1 file changed, 29 insertions(+), 26 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/BaseRenderer.cpp b/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-index 7889cf8..d4bb306 100644
---- a/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-@@ -673,35 +673,38 @@ void CBaseRenderer::ManageDisplay()
-     else if(stereo_view == RENDER_STEREO_VIEW_RIGHT) stereo_view = RENDER_STEREO_VIEW_LEFT;
-   }
- 
--  switch(stereo_mode)
-+  if (m_format != RENDER_FMT_BYPASS)
-   {
--    case CONF_FLAGS_STEREO_MODE_TAB:
--      // Those are flipped in y
--      if (m_format == RENDER_FMT_CVBREF || m_format == RENDER_FMT_MEDIACODEC)
--      {
--        if (stereo_view == RENDER_STEREO_VIEW_LEFT)
--          m_sourceRect.y1 += m_sourceRect.y2*0.5f;
--        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--          m_sourceRect.y2 *= 0.5f;
--      }
--      else
--      {
--        if (stereo_view == RENDER_STEREO_VIEW_LEFT)
--          m_sourceRect.y2 *= 0.5f;
--        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--          m_sourceRect.y1 += m_sourceRect.y2*0.5f;
--      }
--      break;
-+    switch(stereo_mode)
-+    {
-+      case CONF_FLAGS_STEREO_MODE_TAB:
-+        // Those are flipped in y
-+        if (m_format == RENDER_FMT_CVBREF || m_format == RENDER_FMT_MEDIACODEC)
-+        {
-+          if (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+            m_sourceRect.y1 += m_sourceRect.y2*0.5f;
-+          else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+            m_sourceRect.y2 *= 0.5f;
-+        }
-+        else
-+        {
-+          if (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+            m_sourceRect.y2 *= 0.5f;
-+          else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+            m_sourceRect.y1 += m_sourceRect.y2*0.5f;
-+        }
-+        break;
- 
--    case CONF_FLAGS_STEREO_MODE_SBS:
--      if     (stereo_view == RENDER_STEREO_VIEW_LEFT)
--        m_sourceRect.x2 *= 0.5f;
--      else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--        m_sourceRect.x1 += m_sourceRect.x2*0.5f;
--      break;
-+      case CONF_FLAGS_STEREO_MODE_SBS:
-+        if     (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+          m_sourceRect.x2 *= 0.5f;
-+        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+          m_sourceRect.x1 += m_sourceRect.x2*0.5f;
-+        break;
- 
--    default:
--      break;
-+      default:
-+        break;
-+    }
-   }
- 
-   CalcNormalDisplayRect(m_viewRect.x1, m_viewRect.y1, m_viewRect.Width(), m_viewRect.Height(), GetAspectRatio() * CDisplaySettings::GetInstance().GetPixelRatio(), CDisplaySettings::GetInstance().GetZoomAmount(), CDisplaySettings::GetInstance().GetVerticalShift());
-
-From 5ebb280be9de4ce882de665215c8bbda0c072864 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 14 Mar 2015 12:38:08 +0000
-Subject: [PATCH 14/93] Switch to using transform flags for 3d modes
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp      | 100 +++++++++---------------
- xbmc/cores/omxplayer/OMXPlayerVideo.cpp         |  83 +++++---------------
- xbmc/cores/omxplayer/OMXVideo.cpp               |  36 ++++-----
- xbmc/cores/omxplayer/OMXVideo.h                 |   2 +-
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp |  11 ++-
- 5 files changed, 79 insertions(+), 153 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 9b5c666..2dff194 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -452,11 +452,7 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     return;
-   }
- 
--  if (g_graphicsContext.GetStereoMode())
--    g_graphicsContext.SetStereoView(RENDER_STEREO_VIEW_LEFT);
-   ManageDisplay();
--  if (g_graphicsContext.GetStereoMode())
--    g_graphicsContext.SetStereoView(RENDER_STEREO_VIEW_OFF);
- 
-   // if running bypass, then the player might need the src/dst rects
-   // for sizing video playback on a layer other than the gles layer.
-@@ -693,10 +689,8 @@ EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- 
- void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect)
- {
--  // we get called twice a frame for left/right. Can ignore the rights.
--  if (g_graphicsContext.GetStereoView() == RENDER_STEREO_VIEW_RIGHT)
--    return;
-   CSingleLock lock(m_sharedSection);
-+  assert(g_graphicsContext.GetStereoView() != RENDER_STEREO_VIEW_RIGHT);
- 
-   if (!m_vout_input)
-     return;
-@@ -707,6 +701,10 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   bool stereo_invert                   = (m_iFlags & CONF_FLAGS_STEREO_CADANCE_RIGHT_LEFT) ? true : false;
-   RENDER_STEREO_MODE display_stereo_mode = g_graphicsContext.GetStereoMode();
- 
-+  // ignore video stereo mode when 3D display mode is disabled
-+  if (display_stereo_mode == RENDER_STEREO_MODE_OFF)
-+    video_stereo_mode = RENDER_STEREO_MODE_OFF;
-+
-   // fix up transposed video
-   if (m_renderOrientation == 90 || m_renderOrientation == 270)
-   {
-@@ -738,40 +736,17 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-   CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
- 
--  if (display_stereo_mode != RENDER_STEREO_MODE_OFF && display_stereo_mode != RENDER_STEREO_MODE_MONO)
--  switch (video_stereo_mode)
-+  if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-   {
--  case RENDER_STEREO_MODE_SPLIT_VERTICAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.x1 == 0.0f && DestRect.x2 * 2.0f == gui.Width() && !stereo_invert)
--    {
--      SrcRect.x2 *= 2.0f;
--      DestRect.x2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_RED_CYAN || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_GREEN_MAGENTA || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_YELLOW_BLUE)
--    {
--      SrcRect.x2 *= 2.0f;
--    }
--    break;
--
--  case RENDER_STEREO_MODE_SPLIT_HORIZONTAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.y1 == 0.0f && DestRect.y2 * 2.0f == gui.Height() && !stereo_invert)
--    {
--      SrcRect.y2 *= 2.0f;
--      DestRect.y2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_RED_CYAN || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_GREEN_MAGENTA || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_YELLOW_BLUE)
--    {
--      SrcRect.y2 *= 2.0f;
--    }
--    break;
--
--  default: break;
-+    float width = DestRect.x2 - DestRect.x1;
-+    DestRect.x1 *= 2.0f;
-+    DestRect.x2 = DestRect.x1 + 2.0f * width;
-+  }
-+  else if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+  {
-+    float height = DestRect.y2 - DestRect.y1;
-+    DestRect.y1 *= 2.0f;
-+    DestRect.y2 = DestRect.y1 + 2.0f * height;
-   }
- 
-   if (gui != display)
-@@ -787,7 +762,7 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   MMAL_DISPLAYREGION_T region;
-   memset(&region, 0, sizeof region);
- 
--  region.set                 = MMAL_DISPLAY_SET_DEST_RECT|MMAL_DISPLAY_SET_SRC_RECT|MMAL_DISPLAY_SET_FULLSCREEN|MMAL_DISPLAY_SET_NOASPECT|MMAL_DISPLAY_SET_MODE;
-+  region.set                 = MMAL_DISPLAY_SET_DEST_RECT|MMAL_DISPLAY_SET_SRC_RECT|MMAL_DISPLAY_SET_FULLSCREEN|MMAL_DISPLAY_SET_NOASPECT|MMAL_DISPLAY_SET_MODE|MMAL_DISPLAY_SET_TRANSFORM;
-   region.dest_rect.x         = lrintf(DestRect.x1);
-   region.dest_rect.y         = lrintf(DestRect.y1);
-   region.dest_rect.width     = lrintf(DestRect.Width());
-@@ -800,35 +775,32 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
- 
-   region.fullscreen = MMAL_FALSE;
-   region.noaspect = MMAL_TRUE;
-+  region.mode = MMAL_DISPLAY_MODE_LETTERBOX;
-+
-+  if (m_renderOrientation == 90)
-+    region.transform = MMAL_DISPLAY_ROT90;
-+  else if (m_renderOrientation == 180)
-+    region.transform = MMAL_DISPLAY_ROT180;
-+  else if (m_renderOrientation == 270)
-+    region.transform = MMAL_DISPLAY_ROT270;
-+  else
-+    region.transform = MMAL_DISPLAY_ROT0;
- 
--  if (m_renderOrientation)
--  {
--    region.set |= MMAL_DISPLAY_SET_TRANSFORM;
--    if (m_renderOrientation == 90)
--      region.transform = MMAL_DISPLAY_ROT90;
--    else if (m_renderOrientation == 180)
--      region.transform = MMAL_DISPLAY_ROT180;
--    else if (m_renderOrientation == 270)
--      region.transform = MMAL_DISPLAY_ROT270;
--    else assert(0);
--  }
--
--  if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_TOP_TO_TOP;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_TOP_TO_LEFT;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_LEFT_TO_TOP;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_LEFT_TO_LEFT;
-+  if (m_video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_TB);
-+  else if (m_video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_SBS);
-   else
--    region.mode = MMAL_DISPLAY_MODE_LETTERBOX;
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_MONO);
-+
-+  if (m_StereoInvert)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_INVERT);
- 
-   MMAL_STATUS_T status = mmal_util_set_display_region(m_vout_input, &region);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to set display region (status=%x %s)", CLASSNAME, __func__, status, mmal_status_to_string(status));
- 
--  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d mode:%d", CLASSNAME, __func__,
-+  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d t:%x", CLASSNAME, __func__,
-       region.src_rect.x, region.src_rect.y, region.src_rect.width, region.src_rect.height,
--      region.dest_rect.x, region.dest_rect.y, region.dest_rect.width, region.dest_rect.height, region.mode);
-+      region.dest_rect.x, region.dest_rect.y, region.dest_rect.width, region.dest_rect.height, region.transform);
- }
-diff --git a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-index 0e78de4..0e04360 100644
---- a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-@@ -650,10 +650,6 @@ int OMXPlayerVideo::GetFreeSpace()
- 
- void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRect)
- {
--  // we get called twice a frame for left/right. Can ignore the rights.
--  if (g_graphicsContext.GetStereoView() == RENDER_STEREO_VIEW_RIGHT)
--    return;
--
-   CRect SrcRect = InSrcRect, DestRect = InDestRect;
-   unsigned flags = GetStereoModeFlags(GetStereoMode());
-   RENDER_STEREO_MODE video_stereo_mode = (flags & CONF_FLAGS_STEREO_MODE_SBS) ? RENDER_STEREO_MODE_SPLIT_VERTICAL :
-@@ -661,6 +657,10 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-   bool stereo_invert                   = (flags & CONF_FLAGS_STEREO_CADANCE_RIGHT_LEFT) ? true : false;
-   RENDER_STEREO_MODE display_stereo_mode = g_graphicsContext.GetStereoMode();
- 
-+  // ignore video stereo mode when 3D display mode is disabled
-+  if (display_stereo_mode == RENDER_STEREO_MODE_OFF)
-+    video_stereo_mode = RENDER_STEREO_MODE_OFF;
-+
-   // fix up transposed video
-   if (m_hints.orientation == 90 || m_hints.orientation == 270)
-   {
-@@ -692,41 +692,17 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-   CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-   CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
- 
--  switch (video_stereo_mode)
-+  if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-   {
--  case RENDER_STEREO_MODE_SPLIT_VERTICAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.x1 == 0.0f && DestRect.x2 * 2.0f == gui.Width() && !stereo_invert)
--    {
--      SrcRect.x2 *= 2.0f;
--      DestRect.x2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (stereo_invert)
--    {
--      SrcRect.x1 += m_hints.width / 2;
--      SrcRect.x2 += m_hints.width / 2;
--    }
--    break;
--
--  case RENDER_STEREO_MODE_SPLIT_HORIZONTAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.y1 == 0.0f && DestRect.y2 * 2.0f == gui.Height() && !stereo_invert)
--    {
--      SrcRect.y2 *= 2.0f;
--      DestRect.y2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (stereo_invert)
--    {
--      SrcRect.y1 += m_hints.height / 2;
--      SrcRect.y2 += m_hints.height / 2;
--    }
--    break;
--
--  default: break;
-+    float width = DestRect.x2 - DestRect.x1;
-+    DestRect.x1 *= 2.0f;
-+    DestRect.x2 = DestRect.x1 + 2.0f * width;
-+  }
-+  else if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+  {
-+    float height = DestRect.y2 - DestRect.y1;
-+    DestRect.y1 *= 2.0f;
-+    DestRect.y2 = DestRect.y1 + 2.0f * height;
-   }
- 
-   if (gui != display)
-@@ -738,7 +714,7 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-     DestRect.y1 *= yscale;
-     DestRect.y2 *= yscale;
-   }
--  m_omxVideo.SetVideoRect(SrcRect, DestRect, video_stereo_mode, display_stereo_mode);
-+  m_omxVideo.SetVideoRect(SrcRect, DestRect, m_video_stereo_mode, m_display_stereo_mode, m_StereoInvert);
- }
- 
- void OMXPlayerVideo::RenderUpdateCallBack(const void *ctx, const CRect &SrcRect, const CRect &DestRect)
-@@ -753,40 +729,17 @@ void OMXPlayerVideo::ResolutionUpdateCallBack(uint32_t width, uint32_t height, f
-   uint32_t video_width   = CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth;
-   uint32_t video_height  = CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight;
- 
--  unsigned flags = 0;
-   ERenderFormat format = RENDER_FMT_BYPASS;
- 
-+  /* figure out steremode expected based on user settings and hints */
-+  unsigned flags = GetStereoModeFlags(GetStereoMode());
-+
-   if(m_bAllowFullscreen)
-   {
-     flags |= CONF_FLAGS_FULLSCREEN;
-     m_bAllowFullscreen = false; // only allow on first configure
-   }
- 
--  flags |= GetStereoModeFlags(GetStereoMode());
--
--  if(flags & CONF_FLAGS_STEREO_MODE_SBS)
--  {
--    if(g_Windowing.Support3D(video_width, video_height, D3DPRESENTFLAG_MODE3DSBS))
--      CLog::Log(LOGNOTICE, "3DSBS movie found");
--    else
--    {
--      flags &= ~CONF_FLAGS_STEREO_MODE_MASK(~0);
--      CLog::Log(LOGNOTICE, "3DSBS movie found but not supported");
--    }
--  }
--  else if(flags & CONF_FLAGS_STEREO_MODE_TAB)
--  {
--    if(g_Windowing.Support3D(video_width, video_height, D3DPRESENTFLAG_MODE3DTB))
--      CLog::Log(LOGNOTICE, "3DTB movie found");
--    else
--    {
--      flags &= ~CONF_FLAGS_STEREO_MODE_MASK(~0);
--      CLog::Log(LOGNOTICE, "3DTB movie found but not supported");
--    }
--  }
--  else
--    CLog::Log(LOGNOTICE, "not a 3D movie");
--
-   unsigned int iDisplayWidth  = width;
-   unsigned int iDisplayHeight = height;
- 
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index ea8c0fc..a9825a0 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -217,15 +217,6 @@ bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
-   OMX_INIT_STRUCTURE(configDisplay);
-   configDisplay.nPortIndex = m_omx_render.GetInputPort();
- 
--  configDisplay.set = OMX_DISPLAY_SET_TRANSFORM;
--  configDisplay.transform = m_transform;
--  omx_err = m_omx_render.SetConfig(OMX_IndexConfigDisplayRegion, &configDisplay);
--  if(omx_err != OMX_ErrorNone)
--  {
--    CLog::Log(LOGWARNING, "%s::%s - could not set transform : %d", CLASSNAME, __func__, m_transform);
--    return false;
--  }
--
-   if(m_hdmi_clock_sync)
-   {
-     OMX_CONFIG_LATENCYTARGETTYPE latencyTarget;
-@@ -847,7 +838,7 @@ void COMXVideo::Reset(void)
- }
- 
- ///////////////////////////////////////////////////////////////////////////////////////////
--void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode)
-+void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode, bool stereo_invert)
- {
-   CSingleLock lock (m_critSection);
-   if(!m_is_open)
-@@ -857,7 +848,7 @@ void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER
- 
-   OMX_INIT_STRUCTURE(configDisplay);
-   configDisplay.nPortIndex = m_omx_render.GetInputPort();
--  configDisplay.set                 = (OMX_DISPLAYSETTYPE)(OMX_DISPLAY_SET_DEST_RECT|OMX_DISPLAY_SET_SRC_RECT|OMX_DISPLAY_SET_FULLSCREEN|OMX_DISPLAY_SET_NOASPECT|OMX_DISPLAY_SET_MODE);
-+  configDisplay.set                 = (OMX_DISPLAYSETTYPE)(OMX_DISPLAY_SET_DEST_RECT|OMX_DISPLAY_SET_SRC_RECT|OMX_DISPLAY_SET_FULLSCREEN|OMX_DISPLAY_SET_NOASPECT|OMX_DISPLAY_SET_MODE|OMX_DISPLAY_SET_TRANSFORM);
-   configDisplay.dest_rect.x_offset  = lrintf(DestRect.x1);
-   configDisplay.dest_rect.y_offset  = lrintf(DestRect.y1);
-   configDisplay.dest_rect.width     = lrintf(DestRect.Width());
-@@ -870,23 +861,24 @@ void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER
- 
-   configDisplay.fullscreen = OMX_FALSE;
-   configDisplay.noaspect = OMX_TRUE;
-+  configDisplay.mode = OMX_DISPLAY_MODE_LETTERBOX;
-+  configDisplay.transform = m_transform;
- 
--  if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_TOP_TO_TOP;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_TOP_TO_LEFT;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_LEFT_TO_TOP;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_LEFT_TO_LEFT;
-+  if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_TB);
-+  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_SBS);
-   else
--    configDisplay.mode = OMX_DISPLAY_MODE_LETTERBOX;
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_MONO);
-+
-+  if (stereo_invert)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_INVERT);
- 
-   m_omx_render.SetConfig(OMX_IndexConfigDisplayRegion, &configDisplay);
- 
--  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d mode:%d", CLASSNAME, __func__,
-+  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d t:%x", CLASSNAME, __func__,
-       configDisplay.src_rect.x_offset, configDisplay.src_rect.y_offset, configDisplay.src_rect.width, configDisplay.src_rect.height,
--      configDisplay.dest_rect.x_offset, configDisplay.dest_rect.y_offset, configDisplay.dest_rect.width, configDisplay.dest_rect.height, configDisplay.mode);
-+      configDisplay.dest_rect.x_offset, configDisplay.dest_rect.y_offset, configDisplay.dest_rect.width, configDisplay.dest_rect.height, configDisplay.transform);
- }
- 
- int COMXVideo::GetInputBufferSize()
-diff --git a/xbmc/cores/omxplayer/OMXVideo.h b/xbmc/cores/omxplayer/OMXVideo.h
-index 7baefa5..31982b4 100644
---- a/xbmc/cores/omxplayer/OMXVideo.h
-+++ b/xbmc/cores/omxplayer/OMXVideo.h
-@@ -67,7 +67,7 @@ public:
-   void Reset(void);
-   void SetDropState(bool bDrop);
-   std::string GetDecoderName() { return m_video_codec_name; };
--  void SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode);
-+  void SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode, bool stereo_invert);
-   int GetInputBufferSize();
-   bool GetPlayerInfo(double &match, double &phase, double &pll);
-   void SubmitEOS();
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index 5d5b74b..443d037 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -234,7 +234,9 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-       /* inform TV of any 3D settings. Note this property just applies to next hdmi mode change, so no need to call for 2D modes */
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_3D_STRUCTURE;
--      if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+      if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING) && CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC) && res.fRefreshRate <= 30.0f)
-+        property.param1 = HDMI_3D_FORMAT_FRAME_PACKING;
-+      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-         property.param1 = HDMI_3D_FORMAT_SBS_HALF;
-       else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-         property.param1 = HDMI_3D_FORMAT_TB_HALF;
-@@ -334,6 +336,13 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-   DISPMANX_TRANSFORM_T transform = DISPMANX_NO_ROTATE;
-   DISPMANX_UPDATE_HANDLE_T dispman_update = m_DllBcmHost->vc_dispmanx_update_start(0);
- 
-+  if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+    transform = DISPMANX_STEREOSCOPIC_SBS;
-+  else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+    transform = DISPMANX_STEREOSCOPIC_TB;
-+  else
-+    transform = DISPMANX_STEREOSCOPIC_MONO;
-+
-   CLog::Log(LOGDEBUG, "EGL set resolution %dx%d -> %dx%d @ %.2f fps (%d,%d) flags:%x aspect:%.2f\n",
-       m_width, m_height, dst_rect.width, dst_rect.height, res.fRefreshRate, GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), (int)res.dwFlags, res.fPixelRatio);
- 
-
-From 2be3612226ee01a6d294c6ca6a7d8d0849bd4221 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 21 Jun 2015 17:42:03 +0100
-Subject: [PATCH 15/93] Remove unused Support3D function
-
----
- xbmc/windowing/egl/WinSystemEGL.cpp | 29 -----------------------------
- xbmc/windowing/egl/WinSystemEGL.h   |  1 -
- 2 files changed, 30 deletions(-)
-
-diff --git a/xbmc/windowing/egl/WinSystemEGL.cpp b/xbmc/windowing/egl/WinSystemEGL.cpp
-index 696ded1..718fb4c 100644
---- a/xbmc/windowing/egl/WinSystemEGL.cpp
-+++ b/xbmc/windowing/egl/WinSystemEGL.cpp
-@@ -531,35 +531,6 @@ EGLConfig CWinSystemEGL::GetEGLConfig()
-   return m_config;
- }
- 
--// the logic in this function should match whether CBaseRenderer::FindClosestResolution picks a 3D mode
--bool CWinSystemEGL::Support3D(int width, int height, uint32_t mode) const
--{
--  RESOLUTION_INFO &curr = CDisplaySettings::GetInstance().GetResolutionInfo(g_graphicsContext.GetVideoResolution());
--
--  // if we are using automatic hdmi mode switching
--  if (CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_ADJUSTREFRESHRATE) != ADJUST_REFRESHRATE_OFF)
--  {
--    int searchWidth = curr.iScreenWidth;
--    int searchHeight = curr.iScreenHeight;
--
--    // only search the custom resolutions
--    for (unsigned int i = (int)RES_DESKTOP; i < CDisplaySettings::GetInstance().ResolutionInfoSize(); i++)
--    {
--      RESOLUTION_INFO res = CDisplaySettings::GetInstance().GetResolutionInfo(i);
--      if(res.iScreenWidth == searchWidth && res.iScreenHeight == searchHeight && (res.dwFlags & mode))
--        return true;
--    }
--  }
--  // otherwise just consider current mode
--  else
--  {
--     if (curr.dwFlags & mode)
--       return true;
--  }
--
--  return false;
--}
--
- bool CWinSystemEGL::ClampToGUIDisplayLimits(int &width, int &height)
- {
-   width = width > m_nWidth ? m_nWidth : width;
-diff --git a/xbmc/windowing/egl/WinSystemEGL.h b/xbmc/windowing/egl/WinSystemEGL.h
-index 9d4baf6..1ec4225 100644
---- a/xbmc/windowing/egl/WinSystemEGL.h
-+++ b/xbmc/windowing/egl/WinSystemEGL.h
-@@ -59,7 +59,6 @@ public:
-   virtual void  Register(IDispResource *resource);
-   virtual void  Unregister(IDispResource *resource);
- 
--  virtual bool  Support3D(int width, int height, uint32_t mode)     const;
-   virtual bool  ClampToGUIDisplayLimits(int &width, int &height);
- 
-   EGLConfig     GetEGLConfig();
-
-From ad81921b2e03b01bed2d40f0f1aff697cb48fa56 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Dec 2015 21:08:34 +0000
-Subject: [PATCH 16/93] Avoid switching stereo mode based on current display
- mode
-
----
- xbmc/guilib/GraphicContext.cpp | 24 ------------------------
- 1 file changed, 24 deletions(-)
-
-diff --git a/xbmc/guilib/GraphicContext.cpp b/xbmc/guilib/GraphicContext.cpp
-index a1b8812..60979bb 100644
---- a/xbmc/guilib/GraphicContext.cpp
-+++ b/xbmc/guilib/GraphicContext.cpp
-@@ -429,30 +429,6 @@ void CGraphicContext::SetVideoResolutionInternal(RESOLUTION res, bool forceUpdat
-   Lock();
- 
-   RESOLUTION_INFO info_org  = CDisplaySettings::GetInstance().GetResolutionInfo(res);
--  RESOLUTION_INFO info_last = CDisplaySettings::GetInstance().GetResolutionInfo(lastRes);
--
--  RENDER_STEREO_MODE stereo_mode = m_stereoMode;
--
--  // if the new resolution is an actual stereo mode, switch to that
--  // if the old resolution was an actual stereo mode and renderer is still in old 3D mode, switch to no 3d mode
--  if (info_org.dwFlags & D3DPRESENTFLAG_MODE3DTB)
--    stereo_mode = RENDER_STEREO_MODE_SPLIT_HORIZONTAL;
--  else if (info_org.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
--    stereo_mode = RENDER_STEREO_MODE_SPLIT_VERTICAL;
--  else if ((info_last.dwFlags & D3DPRESENTFLAG_MODE3DTB)
--        && m_stereoMode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    stereo_mode = RENDER_STEREO_MODE_OFF;
--  else if ((info_last.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
--        && m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    stereo_mode = RENDER_STEREO_MODE_OFF;
--
--  if(stereo_mode != m_stereoMode)
--  {
--    m_stereoView     = RENDER_STEREO_VIEW_OFF;
--    m_stereoMode     = stereo_mode;
--    m_nextStereoMode = stereo_mode;
--    CSettings::GetInstance().SetInt(CSettings::SETTING_VIDEOSCREEN_STEREOSCOPICMODE, (int)m_stereoMode);
--  }
- 
-   RESOLUTION_INFO info_mod = GetResInfo(res);
- 
-
-From be69b44990015a874305ef96e7fbdef7f815599e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 21 Jun 2015 18:53:29 +0100
-Subject: [PATCH 17/93] Drop reporting 3D modes and just use current rendering
- mode to request 3D signalling
-
-[rbp] Add ntsc version of 48Hz mode
-
-[rbp] Extract the correct resolution with Pi LCD
-
-[rpb] Change order or CEA and DMT mode probing so CEA modes are preferred
-
-[rbp] Allow interlaced resolutions into supported hdmi mode list 2
----
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp | 119 +++++++++---------------
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h   |   4 +-
- 2 files changed, 47 insertions(+), 76 deletions(-)
-
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index 443d037..ee29770 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -28,6 +28,9 @@
- #include "linux/RBP.h"
- #include "utils/StringUtils.h"
- #include "settings/Settings.h"
-+#include "guilib/GraphicContext.h"
-+#include "guilib/StereoscopicsManager.h"
-+#include "rendering/RenderSystem.h"
- #include <cassert>
- 
- #ifndef __VIDEOCORE4__
-@@ -185,12 +188,13 @@ bool CEGLNativeTypeRaspberryPI::GetNativeResolution(RESOLUTION_INFO *res) const
- }
- 
- #if defined(TARGET_RASPBERRY_PI)
--int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions)
-+int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions, bool desktop)
- {
-+  uint32_t mask = desktop ? D3DPRESENTFLAG_MODEMASK : D3DPRESENTFLAG_MODE3DSBS|D3DPRESENTFLAG_MODE3DTB;
-   for (int i = 0; i < (int)resolutions.size(); i++)
-   {
-     if(resolutions[i].iScreenWidth == res.iScreenWidth && resolutions[i].iScreenHeight == res.iScreenHeight && resolutions[i].fRefreshRate == res.fRefreshRate &&
--      (resolutions[i].dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK))
-+      (resolutions[i].dwFlags & mask) == (res.dwFlags & mask))
-     {
-        return i;
-     }
-@@ -200,13 +204,14 @@ int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res
- #endif
- 
- #if defined(TARGET_RASPBERRY_PI)
--int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions)
-+int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions, bool desktop /* = false */)
- {
-   SetResolutionString(res);
--  int i = FindMatchingResolution(res, resolutions);
-+  int i = FindMatchingResolution(res, resolutions, desktop);
-   if (i>=0)
-   {  // don't replace a progressive resolution with an interlaced one of same resolution
--     resolutions[i] = res;
-+    if (!(res.dwFlags & D3DPRESENTFLAG_INTERLACED))
-+      resolutions[i] = res;
-   }
-   else
-   {
-@@ -224,25 +229,28 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- 
-   DestroyDispmaxWindow();
- 
-+  RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
-   if(GETFLAGS_GROUP(res.dwFlags) && GETFLAGS_MODE(res.dwFlags))
-   {
-+    uint32_t mode3d = HDMI_3D_FORMAT_NONE;
-     sem_init(&m_tv_synced, 0, 0);
-     m_DllBcmHost->vc_tv_register_callback(CallbackTvServiceCallback, this);
- 
--    if (res.dwFlags & (D3DPRESENTFLAG_MODE3DSBS|D3DPRESENTFLAG_MODE3DTB))
-+    if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL || stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-     {
-       /* inform TV of any 3D settings. Note this property just applies to next hdmi mode change, so no need to call for 2D modes */
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_3D_STRUCTURE;
-       if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING) && CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC) && res.fRefreshRate <= 30.0f)
-         property.param1 = HDMI_3D_FORMAT_FRAME_PACKING;
--      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+      else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-         property.param1 = HDMI_3D_FORMAT_SBS_HALF;
--      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+      else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-         property.param1 = HDMI_3D_FORMAT_TB_HALF;
-       else
-         property.param1 = HDMI_3D_FORMAT_NONE;
-       property.param2 = 0;
-+      mode3d = property.param1;
-       vc_tv_hdmi_set_property(&property);
-     }
- 
-@@ -261,19 +269,19 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- 
-     if (success == 0)
-     {
--      CLog::Log(LOGDEBUG, "EGL set HDMI mode (%d,%d)=%d%s%s\n",
-+      CLog::Log(LOGDEBUG, "EGL set HDMI mode (%d,%d)=%d %s%s\n",
-                           GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), success,
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS) ? " SBS":"",
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DTB) ? " TB":"");
-+                          CStereoscopicsManager::GetInstance().ConvertGuiStereoModeToString(stereo_mode),
-+                          mode3d==HDMI_3D_FORMAT_FRAME_PACKING ? " FP" : mode3d==HDMI_3D_FORMAT_SBS_HALF ? " SBS" : mode3d==HDMI_3D_FORMAT_TB_HALF ? " TB" : "");
- 
-       sem_wait(&m_tv_synced);
-     }
-     else
-     {
--      CLog::Log(LOGERROR, "EGL failed to set HDMI mode (%d,%d)=%d%s%s\n",
-+      CLog::Log(LOGERROR, "EGL failed to set HDMI mode (%d,%d)=%d %s%s\n",
-                           GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), success,
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS) ? " SBS":"",
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DTB) ? " TB":"");
-+                          CStereoscopicsManager::GetInstance().ConvertGuiStereoModeToString(stereo_mode),
-+                          mode3d==HDMI_3D_FORMAT_FRAME_PACKING ? " FP" : mode3d==HDMI_3D_FORMAT_SBS_HALF ? " SBS" : mode3d==HDMI_3D_FORMAT_TB_HALF ? " TB" : "");
-     }
-     m_DllBcmHost->vc_tv_unregister_callback(CallbackTvServiceCallback);
-     sem_destroy(&m_tv_synced);
-@@ -336,9 +344,9 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-   DISPMANX_TRANSFORM_T transform = DISPMANX_NO_ROTATE;
-   DISPMANX_UPDATE_HANDLE_T dispman_update = m_DllBcmHost->vc_dispmanx_update_start(0);
- 
--  if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+  if (stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-     transform = DISPMANX_STEREOSCOPIC_SBS;
--  else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+  else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-     transform = DISPMANX_STEREOSCOPIC_TB;
-   else
-     transform = DISPMANX_STEREOSCOPIC_MONO;
-@@ -445,10 +453,8 @@ static void SetResolutionString(RESOLUTION_INFO &res)
-   res.iWidth = gui_width;
-   res.iHeight = gui_height;
- 
--  res.strMode = StringUtils::Format("%dx%d (%dx%d) @ %.2f%s%s%s - Full Screen", res.iScreenWidth, res.iScreenHeight, res.iWidth, res.iHeight, res.fRefreshRate,
--    res.dwFlags & D3DPRESENTFLAG_INTERLACED ? "i" : "",
--    res.dwFlags & D3DPRESENTFLAG_MODE3DTB   ? " 3DTB" : "",
--    res.dwFlags & D3DPRESENTFLAG_MODE3DSBS  ? " 3DSBS" : "");
-+  res.strMode = StringUtils::Format("%dx%d (%dx%d) @ %.2f%s - Full Screen", res.iScreenWidth, res.iScreenHeight, res.iWidth, res.iHeight, res.fRefreshRate,
-+    res.dwFlags & D3DPRESENTFLAG_INTERLACED ? "i" : "");
- }
- 
- static SDTV_ASPECT_T get_sdtv_aspect_from_display_aspect(float display_aspect)
-@@ -503,17 +509,6 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-       m_desktopRes.iScreenHeight= tv_state.display.hdmi.height;
-       m_desktopRes.dwFlags      = MAKEFLAGS(tv_state.display.hdmi.group, tv_state.display.hdmi.mode, tv_state.display.hdmi.scan_mode);
-       m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((HDMI_ASPECT_T)tv_state.display.hdmi.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
--      // Also add 3D flags
--      if (tv_state.display.hdmi.format_3d == HDMI_3D_FORMAT_SBS_HALF)
--      {
--        m_desktopRes.dwFlags |= D3DPRESENTFLAG_MODE3DSBS;
--        m_desktopRes.fPixelRatio *= 2.0;
--      }
--      else if (tv_state.display.hdmi.format_3d == HDMI_3D_FORMAT_TB_HALF)
--      {
--        m_desktopRes.dwFlags |= D3DPRESENTFLAG_MODE3DTB;
--        m_desktopRes.fPixelRatio *= 0.5;
--      }
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_PIXEL_CLOCK_TYPE;
-       vc_tv_hdmi_get_property(&property);
-@@ -531,6 +526,18 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-       m_desktopRes.fRefreshRate = (float)tv_state.display.sdtv.frame_rate;
-       m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((SDTV_ASPECT_T)tv_state.display.sdtv.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
-     }
-+    else if ((tv_state.state & VC_LCD_ATTACHED_DEFAULT) != 0) // lcd
-+    {
-+      m_desktopRes.iScreen      = 0;
-+      m_desktopRes.bFullScreen  = true;
-+      m_desktopRes.iWidth       = tv_state.display.sdtv.width;
-+      m_desktopRes.iHeight      = tv_state.display.sdtv.height;
-+      m_desktopRes.iScreenWidth = tv_state.display.sdtv.width;
-+      m_desktopRes.iScreenHeight= tv_state.display.sdtv.height;
-+      m_desktopRes.dwFlags      = MAKEFLAGS(HDMI_RES_GROUP_INVALID, 0, 0);
-+      m_desktopRes.fRefreshRate = (float)tv_state.display.sdtv.frame_rate;
-+      m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((SDTV_ASPECT_T)tv_state.display.sdtv.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
-+    }
- 
-     SetResolutionString(m_desktopRes);
- 
-@@ -541,11 +548,13 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-     CLog::Log(LOGDEBUG, "EGL initial desktop resolution %s (%.2f)\n", m_desktopRes.strMode.c_str(), m_desktopRes.fPixelRatio);
-   }
- 
--  GetSupportedModes(HDMI_RES_GROUP_CEA, resolutions);
--  GetSupportedModes(HDMI_RES_GROUP_DMT, resolutions);
--
-+  if(GETFLAGS_GROUP(m_desktopRes.dwFlags) && GETFLAGS_MODE(m_desktopRes.dwFlags))
-   {
--    AddUniqueResolution(m_desktopRes, resolutions);
-+    GetSupportedModes(HDMI_RES_GROUP_DMT, resolutions);
-+    GetSupportedModes(HDMI_RES_GROUP_CEA, resolutions);
-+  }
-+  {
-+    AddUniqueResolution(m_desktopRes, resolutions, true);
-     CLog::Log(LOGDEBUG, "EGL probe resolution %s:%x\n", m_desktopRes.strMode.c_str(), m_desktopRes.dwFlags);
-   }
- 
-@@ -638,54 +647,16 @@ void CEGLNativeTypeRaspberryPI::GetSupportedModes(HDMI_RES_GROUP_T group, std::v
-       if (!m_desktopRes.dwFlags && prefer_group == group && prefer_mode == tv->code)
-         m_desktopRes = res;
- 
--      if (res.dwFlags & D3DPRESENTFLAG_INTERLACED)
--        continue;
--
-       AddUniqueResolution(res, resolutions);
-       CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f) %s%s:%x\n", i, res.strMode.c_str(), res.fPixelRatio,
-           tv->native ? "N" : "", tv->scan_mode ? "I" : "", tv->code);
- 
--      if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
-+      if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 48 || tv->frame_rate == 60 || tv->frame_rate == 72)
-       {
-         RESOLUTION_INFO res2 = res;
-         res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
-         AddUniqueResolution(res2, resolutions);
-       }
--
--      // Also add 3D versions of modes
--      if (tv->struct_3d_mask & HDMI_3D_STRUCT_SIDE_BY_SIDE_HALF_HORIZONTAL)
--      {
--        RESOLUTION_INFO res2 = res;
--        res2.dwFlags |= D3DPRESENTFLAG_MODE3DSBS;
--        res2.fPixelRatio    = get_display_aspect_ratio((HDMI_ASPECT_T)tv->aspect_ratio) / ((float)res2.iScreenWidth / (float)res2.iScreenHeight);
--        res2.fPixelRatio   *= 2.0f;
--        res2.iSubtitles    = (int)(0.965 * res2.iHeight);
--
--        AddUniqueResolution(res2, resolutions);
--        CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f)\n", i, res2.strMode.c_str(), res2.fPixelRatio);
--        if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
--        {
--          res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
--          AddUniqueResolution(res2, resolutions);
--        }
--      }
--      if (tv->struct_3d_mask & HDMI_3D_STRUCT_TOP_AND_BOTTOM)
--      {
--        RESOLUTION_INFO res2 = res;
--        res2.dwFlags |= D3DPRESENTFLAG_MODE3DTB;
--        res2.fPixelRatio    = get_display_aspect_ratio((HDMI_ASPECT_T)tv->aspect_ratio) / ((float)res2.iScreenWidth / (float)res2.iScreenHeight);
--        res2.fPixelRatio   *= 0.5f;
--        res2.iSubtitles    = (int)(0.965 * res2.iHeight);
--
--        AddUniqueResolution(res2, resolutions);
--        CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f)\n", i, res2.strMode.c_str(), res2.fPixelRatio);
--        if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
--        {
--          res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
--          AddUniqueResolution(res2, resolutions);
--        }
--
--      }
-     }
-   }
-   if (supported_modes)
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-index a0acb1a..e5bcae7 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-@@ -69,7 +69,7 @@ private:
-   static void CallbackTvServiceCallback(void *userdata, uint32_t reason, uint32_t param1, uint32_t param2);
- 
-   void DestroyDispmaxWindow();
--  int FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions);
--  int AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions);
-+  int FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions, bool desktop);
-+  int AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions, bool desktop = false);
- #endif
- };
-
-From 5d3349935e282c6d4faef746a5b8a9934676d4c6 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jun 2015 16:27:15 +0100
-Subject: [PATCH 18/93] Consider stereomode when creating a new window
-
-We might be changing from a non-3D to a 3D mode
----
- xbmc/windowing/egl/WinSystemEGL.cpp | 6 +++++-
- xbmc/windowing/egl/WinSystemEGL.h   | 1 +
- 2 files changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/windowing/egl/WinSystemEGL.cpp b/xbmc/windowing/egl/WinSystemEGL.cpp
-index 718fb4c..a0b285c 100644
---- a/xbmc/windowing/egl/WinSystemEGL.cpp
-+++ b/xbmc/windowing/egl/WinSystemEGL.cpp
-@@ -52,6 +52,7 @@ CWinSystemEGL::CWinSystemEGL() : CWinSystemBase()
-   m_surface           = EGL_NO_SURFACE;
-   m_context           = EGL_NO_CONTEXT;
-   m_config            = NULL;
-+  m_stereo_mode       = RENDER_STEREO_MODE_OFF;
- 
-   m_egl               = NULL;
-   m_iVSyncMode        = 0;
-@@ -273,6 +274,7 @@ bool CWinSystemEGL::CreateNewWindow(const std::string& name, bool fullScreen, RE
- {
-   RESOLUTION_INFO current_resolution;
-   current_resolution.iWidth = current_resolution.iHeight = 0;
-+  RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
- 
-   m_nWidth        = res.iWidth;
-   m_nHeight       = res.iHeight;
-@@ -284,12 +286,14 @@ bool CWinSystemEGL::CreateNewWindow(const std::string& name, bool fullScreen, RE
-     current_resolution.iWidth == res.iWidth && current_resolution.iHeight == res.iHeight &&
-     current_resolution.iScreenWidth == res.iScreenWidth && current_resolution.iScreenHeight == res.iScreenHeight &&
-     m_bFullScreen == fullScreen && current_resolution.fRefreshRate == res.fRefreshRate &&
--    (current_resolution.dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK))
-+    (current_resolution.dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK) &&
-+    m_stereo_mode == stereo_mode)
-   {
-     CLog::Log(LOGDEBUG, "CWinSystemEGL::CreateNewWindow: No need to create a new window");
-     return true;
-   }
- 
-+  m_stereo_mode = stereo_mode;
-   m_bFullScreen   = fullScreen;
-   // Destroy any existing window
-   if (m_surface != EGL_NO_SURFACE)
-diff --git a/xbmc/windowing/egl/WinSystemEGL.h b/xbmc/windowing/egl/WinSystemEGL.h
-index 1ec4225..a33dedc 100644
---- a/xbmc/windowing/egl/WinSystemEGL.h
-+++ b/xbmc/windowing/egl/WinSystemEGL.h
-@@ -78,6 +78,7 @@ protected:
-   EGLSurface            m_surface;
-   EGLContext            m_context;
-   EGLConfig             m_config;
-+  RENDER_STEREO_MODE    m_stereo_mode;
- 
-   CEGLWrapper           *m_egl;
-   std::string           m_extensions;
-
-From 5d836aad86bfed970e902005bae5761415cec58d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 19/93] [rbp/omxplayer] When opening a stream don't try to
- update gui so often
-
----
- xbmc/dialogs/GUIDialogBusy.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/dialogs/GUIDialogBusy.cpp b/xbmc/dialogs/GUIDialogBusy.cpp
-index 6816b45..6cc5a8f 100644
---- a/xbmc/dialogs/GUIDialogBusy.cpp
-+++ b/xbmc/dialogs/GUIDialogBusy.cpp
-@@ -69,7 +69,11 @@ bool CGUIDialogBusy::WaitOnEvent(CEvent &event, unsigned int displaytime /* = 10
-     {
-       dialog->Open();
- 
-+#ifdef TARGET_RASPBERRY_PI
-+      while(!event.WaitMSec(100))
-+#else
-       while(!event.WaitMSec(1))
-+#endif
-       {
-         g_windowManager.ProcessRenderLoop(false);
-         if (allowCancel && dialog->IsCanceled())
-
-From e01575ea1b07d19332017fca0e1a51389b78d93d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 29 Apr 2014 15:23:22 +0100
-Subject: [PATCH 20/93] [ffmpeg] Speed up wtv index creation
-
-The index creation is O(N^2) with number of entries (typically thousands).
-On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
-
-By replacing with an O(N) loop, this takes virtually zero time
----
- tools/depends/target/ffmpeg/Makefile               |  3 +-
- .../ffmpeg_Speed_up_wtv_index_creation.patch       | 47 ++++++++++++++++++++++
- 2 files changed, 49 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index ae932ce..fcfc553 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,6 +1,6 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
--DEPS= ../../Makefile.include FFMPEG-VERSION Makefile
-+DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -70,6 +70,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
-+	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch b/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-new file mode 100644
-index 0000000..d829898
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-@@ -0,0 +1,47 @@
-+commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5
-+Author: popcornmix <popcornmix@gmail.com>
-+Date:   Tue Mar 25 19:43:07 2014 +0000
-+
-+    [ffmpeg] Speed up wtv index creation
-+
-+    The index creation is O(N^2) with number of entries (typically thousands).
-+    On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
-+
-+    By replacing with an O(N) loop, this takes virtually zero time
-+
-+diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c
-+index e423370..70898bd 100644
-+--- a/libavformat/wtvdec.c
-++++ b/libavformat/wtvdec.c
-+@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s)
-+                 pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16);
-+                 if (pb) {
-+                     int i;
-++                    AVIndexEntry *e = wtv->index_entries;
-++                    AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1;
-++                    uint64_t last_position = 0;
-+                     while (1) {
-+                         uint64_t frame_nb = avio_rl64(pb);
-+                         uint64_t position = avio_rl64(pb);
-++                        while (frame_nb > e->size && e <= e_end) {
-++                           e->pos = last_position;
-++                           e++;
-++                        }
-+                         if (avio_feof(pb))
-+                             break;
-+-                        for (i = wtv->nb_index_entries - 1; i >= 0; i--) {
-+-                            AVIndexEntry *e = wtv->index_entries + i;
-+-                            if (frame_nb > e->size)
-+-                                break;
-+-                            if (position > e->pos)
-+-                                e->pos = position;
-+-                        }
-++                        last_position = position;
-+                     }
-++                    e_end->pos = last_position;
-+                     wtvfile_close(pb);
-+-                    st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp;
-++                    st->duration = e_end->timestamp;
-+                 }
-+             }
-+         }
-
-From a29142db6e36056fd988b3199747c0da0dab78a0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 21/93] [hifiberry] Hack: force it to be recognised as IEC958
- capable to enable passthrough options
-
----
- xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-index e22db7a..0120bd5 100644
---- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-+++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-@@ -1342,6 +1342,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev
-     if (snd_card_get_name(cardNr, &cardName) == 0)
-       info.m_displayName = cardName;
- 
-+    // hack: hifiberry digi doesn't correctly report as iec958 device. Needs fixing in kernel driver
-+    if (info.m_displayName == "snd_rpi_hifiberry_digi")
-+      info.m_deviceType = AE_DEVTYPE_IEC958;
-+
-     if (info.m_deviceType == AE_DEVTYPE_HDMI && info.m_displayName.size() > 5 &&
-         info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
-     {
-
-From cad0f33be5e0b5989ece0863ba96158dbf5174d9 Mon Sep 17 00:00:00 2001
-From: Ben Avison <bavison@riscosopen.org>
-Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 22/93] Improved file buffering in CArchive
-
-Even though memcpy is typically inlined by the compiler into byte/word loads
-and stores (at least for release builds), the frequency with which 1, 2 and 4
-byte loads/stores are encountered in cases where the size is *not*
-determinable at compile time is still high enough that it's worth handling
-these specially. On the ARM1176JZF-S in the Raspberry Pi, this improves the
-total time to open a library (in the case where it's fetched from a CArchive)
-by around 4%.
-
-It should be noted that this code uses 16-bit and 32-bit word loads and
-stores that are not necessarily aligned to their respective widths. It is
-possible that there are some architectures out there which do not support
-this, although all ARMs since ARMv6 have supported it (and ARMs earlier than
-that are probably not powerful enough to be good targets for XBMC).
----
- xbmc/utils/Archive.h | 16 ++++++++++++++++
- 1 file changed, 16 insertions(+)
-
-diff --git a/xbmc/utils/Archive.h b/xbmc/utils/Archive.h
-index 6ed0f8f..8506d95 100644
---- a/xbmc/utils/Archive.h
-+++ b/xbmc/utils/Archive.h
-@@ -154,9 +154,17 @@ protected:
-      * than waiting until we attempt to put more data into an already full buffer */
-     if (m_BufferRemain > size)
-     {
-+      switch (size)
-+      {
-+      case 1: *m_BufferPos++ = *ptr; m_BufferRemain--; break;
-+      case 2: *(uint16_t *) m_BufferPos = *(const uint16_t *) ptr; m_BufferPos += 2; m_BufferRemain -= 2; break;
-+      case 4: *(uint32_t *) m_BufferPos = *(const uint32_t *) ptr; m_BufferPos += 4; m_BufferRemain -= 4; break;
-+      default:
-       memcpy(m_BufferPos, ptr, size);
-       m_BufferPos += size;
-       m_BufferRemain -= size;
-+      break;
-+      }
-       return *this;
-     }
-     else
-@@ -171,9 +179,17 @@ protected:
-     /* Note, refilling the buffer is deferred until we know we need to read more from it */
-     if (m_BufferRemain >= size)
-     {
-+      switch (size)
-+      {
-+      case 1: *ptr = *m_BufferPos++; m_BufferRemain--; break;
-+      case 2: *(uint16_t *) ptr = *(const uint16_t *) m_BufferPos; m_BufferPos += 2; m_BufferRemain -= 2; break;
-+      case 4: *(uint32_t *) ptr = *(const uint32_t *) m_BufferPos; m_BufferPos += 4; m_BufferRemain -= 4; break;
-+      default:
-       memcpy(ptr, m_BufferPos, size);
-       m_BufferPos += size;
-       m_BufferRemain -= size;
-+      break;
-+      }
-       return *this;
-     }
-     else
-
-From 17eebeec762e4f1c921d886b6863ac4a21cdb2f0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 12 Aug 2014 00:31:36 +0100
-Subject: [PATCH 23/93] [omxcodec] Don't force software codec with dvds
-
----
- xbmc/cores/dvdplayer/DVDPlayer.cpp | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.cpp b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-index 4ffe33a..4b09e8f 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-@@ -3501,7 +3501,9 @@ bool CDVDPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
-       hint.aspect = aspect;
-       hint.forced_aspect = true;
-     }
-+#ifndef TARGET_RASPBERRY_PI
-     hint.software = true;
-+#endif
-   }
-   else if (m_pInputStream && m_pInputStream->IsStreamType(DVDSTREAM_TYPE_PVRMANAGER))
-   {
-
-From 9da36b4157459cc72529ef6be5721f1ff6920ef6 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 24/93] filesystem: Make support of browsing into archives
- optional
-
-The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
-It's quite common to see reports of a large rar file that causes xbmc to crash with an out-of-memory error when browsing or scanning.
-It also can be slow as any archive in the directory is opened and extracted.
-
-This causes issues for people who scan library with archives disabled, then subsequently enable it.
-The library has the .rar files in which don't play without removing and re-adding.
-
-We'll let people who don't use archives disable it manually
----
- addons/resource.language.en_gb/resources/strings.po | 9 +++++++++
- system/settings/settings.xml                        | 5 +++++
- xbmc/filesystem/FileDirectoryFactory.cpp            | 4 ++++
- 3 files changed, 18 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 01173ca..e908209 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18036,6 +18036,15 @@ msgstr ""
- #: system/settings/rbp.xml
- msgctxt "#38010"
- msgid "GPU accelerated"
-+
-+#: system/settings/settings.xml
-+msgctxt "#38020"
-+msgid "Support browsing into archives"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38021"
-+msgid "Allow viewing and playing files in archives (e.g. zip, rar)"
- msgstr ""
- 
- #. Setting #38011 "Videos -> Library -> Show All Items entry"
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 76c9a33..7ca534d 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -317,6 +317,11 @@
-           <default>false</default>
-           <control type="toggle" />
-         </setting>
-+        <setting id="filelists.browsearchives" type="boolean" label="38020" help="38021">
-+          <level>1</level>
-+          <default>true</default>
-+          <control type="toggle" />
-+        </setting>
-       </group>
-     </category>
-     <category id="screensaver" label="360" help="36128">
-diff --git a/xbmc/filesystem/FileDirectoryFactory.cpp b/xbmc/filesystem/FileDirectoryFactory.cpp
-index a1d4ee4..4929283 100644
---- a/xbmc/filesystem/FileDirectoryFactory.cpp
-+++ b/xbmc/filesystem/FileDirectoryFactory.cpp
-@@ -40,6 +40,7 @@
- #include "playlists/PlayListFactory.h"
- #include "Directory.h"
- #include "File.h"
-+#include "settings/Settings.h"
- #include "FileItem.h"
- #include "utils/StringUtils.h"
- #include "URL.h"
-@@ -112,6 +113,8 @@ IFileDirectory* CFileDirectoryFactory::Create(const CURL& url, CFileItem* pItem,
-     return NULL;
-   }
- #endif
-+  if (CSettings::GetInstance().GetBool("filelists.browsearchives"))
-+  {
-   if (url.IsFileType("zip"))
-   {
-     CURL zipURL = URIUtils::CreateArchivePath("zip", url);
-@@ -185,6 +188,7 @@ IFileDirectory* CFileDirectoryFactory::Create(const CURL& url, CFileItem* pItem,
-     }
-     return NULL;
-   }
-+  }
-   if (url.IsFileType("xbt"))
-   {
-     CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
-
-From b0231de02ec1821e136d75ff0f3986aaed8f0d92 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 25/93] [rbp] Make cachemembuffersize default depend on memory
- size
-
----
- xbmc/linux/RBP.cpp                 | 10 ++++++++++
- xbmc/linux/RBP.h                   |  1 +
- xbmc/settings/AdvancedSettings.cpp | 12 +++++++++++-
- 3 files changed, 22 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 2a9a93a..6c5288d 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -23,6 +23,7 @@
- 
- #include <assert.h>
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "utils/log.h"
- 
- #include "cores/omxplayer/OMXImage.h"
-@@ -43,6 +44,12 @@ CRBP::~CRBP()
-   delete m_DllBcmHost;
- }
- 
-+void CRBP::InitializeSettings()
-+{
-+  if (m_initialized && g_advancedSettings.m_cacheMemBufferSize == ~0U)
-+    g_advancedSettings.m_cacheMemBufferSize = m_arm_mem < 256 ? 1024 * 1024 * 2 : 1024 * 1024 * 20;
-+}
-+
- bool CRBP::Initialize()
- {
-   CSingleLock lock (m_critSection);
-@@ -82,6 +89,8 @@ bool CRBP::Initialize()
-   if (!m_gui_resolution_limit)
-     m_gui_resolution_limit = m_gpu_mem < 128 ? 720:1080;
- 
-+  InitializeSettings();
-+
-   g_OMXImage.Initialize();
-   m_omx_image_init = true;
-   return true;
-@@ -94,6 +103,7 @@ void CRBP::LogFirmwareVerison()
-   response[sizeof(response) - 1] = '\0';
-   CLog::Log(LOGNOTICE, "Raspberry PI firmware version: %s", response);
-   CLog::Log(LOGNOTICE, "ARM mem: %dMB GPU mem: %dMB MPG2:%d WVC1:%d", m_arm_mem, m_gpu_mem, m_codec_mpg2_enabled, m_codec_wvc1_enabled);
-+  CLog::Log(LOGNOTICE, "cacheMemBufferSize: %dMB",  g_advancedSettings.m_cacheMemBufferSize >> 20);
-   m_DllBcmHost->vc_gencmd(response, sizeof response, "get_config int");
-   response[sizeof(response) - 1] = '\0';
-   CLog::Log(LOGNOTICE, "Config:\n%s", response);
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 9559914..7fc8b42 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -48,6 +48,7 @@ public:
-   ~CRBP();
- 
-   bool Initialize();
-+  void InitializeSettings();
-   void LogFirmwareVerison();
-   void Deinitialize();
-   int GetArmMem() { return m_arm_mem; }
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index bc3aa8c..562757e 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -49,6 +49,9 @@
- #if defined(TARGET_DARWIN_IOS)
- #include "osx/DarwinUtils.h"
- #endif
-+#if defined(TARGET_RASPBERRY_PI)
-+#include "linux/RBP.h"
-+#endif
- 
- using namespace ADDON;
- using namespace XFILE;
-@@ -344,7 +347,12 @@ void CAdvancedSettings::Initialize()
-   m_bPVRAutoScanIconsUserSet       = false;
-   m_iPVRNumericChannelSwitchTimeout = 1000;
- 
-+#ifdef TARGET_RASPBERRY_PI
-+  // want default to be memory dependent, but interface to gpu not available yet, so set in RBP.cpp
-+  m_cacheMemBufferSize = ~0;
-+#else
-   m_cacheMemBufferSize = 1024 * 1024 * 20;
-+#endif
-   m_networkBufferMode = 0; // Default (buffer all internet streams/filesystems)
-   // the following setting determines the readRate of a player data
-   // as multiply of the default data read rate
-@@ -399,7 +407,9 @@ void CAdvancedSettings::Initialize()
-   #endif
- 
-   m_userAgent = g_sysinfo.GetUserAgent();
--
-+#ifdef TARGET_RASPBERRY_PI
-+  g_RBP.InitializeSettings();
-+#endif
-   m_initialized = true;
- }
- 
-
-From 6d080c7c800d2e1120b46c5490d64d80b4e63ad4 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 26/93] [settings] Experiment: Report DESKTOP resolution in
- video settings
-
----
- xbmc/settings/DisplaySettings.cpp | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/xbmc/settings/DisplaySettings.cpp b/xbmc/settings/DisplaySettings.cpp
-index 7993a73..761385b 100644
---- a/xbmc/settings/DisplaySettings.cpp
-+++ b/xbmc/settings/DisplaySettings.cpp
-@@ -683,6 +683,9 @@ void CDisplaySettings::SettingOptionsResolutionsFiller(const CSetting *setting,
-     std::vector<RESOLUTION_WHR> resolutions = g_Windowing.ScreenResolutions(info.iScreen, info.fRefreshRate);
-     for (std::vector<RESOLUTION_WHR>::const_iterator resolution = resolutions.begin(); resolution != resolutions.end(); ++resolution)
-     {
-+if (resolution->ResInfo_Index == RES_DESKTOP)
-+      list.push_back(std::make_pair(StringUtils::Format("DESKTOP"), resolution->ResInfo_Index));
-+else
-       list.push_back(std::make_pair(
-         StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
-                             ModeFlagsToString(resolution->flags, false).c_str()),
-
-From 80f582c6ced4a245d0cabb97a3e9fefc009e096d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 27/93] [dvdplayer/rbp] Add pi specific option to maintain
- vsync with pll adjustment
-
-New A/V sync option in settings/video/playback to do "Adjust PLL".
-This uses video clock (so perfect video syncing) but avoids having to resample
-or drop/dupe audio packets which is normally required.
-
-Needed updated firmware
-
-[dvdplayeraudio] Add advancedsetting for configuring max pll adjustment
-
-[dvdplayer] Allow pll adjustment to go higher, but tail off more gradually
----
- .../resource.language.en_gb/resources/strings.po   | 23 +++++++++++++-
- system/settings/settings.xml                       | 14 +++++++++
- xbmc/cores/AudioEngine/Utils/AEUtil.h              |  3 +-
- xbmc/cores/dvdplayer/DVDPlayerAudio.cpp            | 36 +++++++++++++++++++---
- xbmc/cores/dvdplayer/DVDPlayerAudio.h              |  3 ++
- xbmc/linux/RBP.cpp                                 | 13 ++++++++
- xbmc/linux/RBP.h                                   |  3 ++
- xbmc/settings/AdvancedSettings.cpp                 |  2 ++
- xbmc/settings/AdvancedSettings.h                   |  1 +
- 9 files changed, 91 insertions(+), 7 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index e908209..0f45ea0 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -6609,7 +6609,22 @@ msgctxt "#13459"
- msgid "Use OMXPlayer for decoding of video files."
- msgstr ""
- 
--#empty strings from id 13460 to 13504
-+#empty strings from id 13460 to 13499
-+
-+#: system/settings/settings.xml
-+msgctxt "#13500"
-+msgid "A/V sync method"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#13503"
-+msgid "Resample audio"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#13504"
-+msgid "Adjust PLL"
-+msgstr ""
- 
- #: system/settings/settings.xml
- msgctxt "#13505"
-@@ -18141,3 +18156,9 @@ msgstr ""
- msgctxt "#38030"
- msgid "This option uses frame-packing to output full resolution for 3D through HDMI.[CR]Enabling this improves quality of Multiview Video Coding (MVC) videos, but may not be supported by all displays."
- msgstr ""
-+
-+#. Description of setting "Videos -> Playback -> A/V sync method" with label #13500
-+#: system/settings/settings.xml
-+msgctxt "#38006"
-+msgid "Audio has to stay in sync, this can either be done by resampling, or adjusting the PLL"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 7ca534d..1b57136 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -595,6 +595,20 @@
-           <default>false</default>
-           <control type="toggle" />
-         </setting>
-+        <setting id="videoplayer.synctype" type="integer" parent="videoplayer.usedisplayasclock" label="13500" help="38006">
-+          <level>2</level>
-+          <default>2</default> <!-- SYNC_RESAMPLE -->
-+          <constraints>
-+            <options>
-+              <option label="13503">2</option> <!-- SYNC_RESAMPLE -->
-+              <option label="13504">3</option> <!-- SYNC_PLLADJUST -->
-+            </options>
-+          </constraints>
-+          <dependencies>
-+            <dependency type="enable" setting="videoplayer.usedisplayasclock" operator="is">true</dependency>
-+          </dependencies>
-+          <control type="spinner" format="string" />
-+        </setting>
-         <setting id="videoplayer.errorinaspect" type="integer" label="22021" help="36170">
-           <level>2</level>
-           <default>0</default>
-diff --git a/xbmc/cores/AudioEngine/Utils/AEUtil.h b/xbmc/cores/AudioEngine/Utils/AEUtil.h
-index 56c0a1f..f7f63b5 100644
---- a/xbmc/cores/AudioEngine/Utils/AEUtil.h
-+++ b/xbmc/cores/AudioEngine/Utils/AEUtil.h
-@@ -57,7 +57,8 @@ enum AVSync
- {
-   SYNC_DISCON   = 0,
-   SYNC_SKIPDUP,
--  SYNC_RESAMPLE
-+  SYNC_RESAMPLE,
-+  SYNC_PLLADJUST
- };
- 
- struct AEDelayStatus
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-index 39074ff..97a23a6 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-@@ -24,6 +24,7 @@
- #include "DVDCodecs/Audio/DVDAudioCodec.h"
- #include "DVDCodecs/DVDFactoryCodec.h"
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "video/VideoReferenceClock.h"
- #include "utils/log.h"
- #include "utils/MathUtils.h"
-@@ -109,6 +110,9 @@ CDVDPlayerAudio::CDVDPlayerAudio(CDVDClock* pClock, CDVDMessageQueue& parent)
-   m_started = false;
-   m_silence = false;
-   m_resampleratio = 1.0;
-+  m_plladjust = 1.0;
-+  m_last_plladjust = 1.0;
-+  m_last_error = 0.0;
-   m_synctype = SYNC_DISCON;
-   m_setsynctype = SYNC_DISCON;
-   m_prevsynctype = -1;
-@@ -182,11 +186,13 @@ void CDVDPlayerAudio::OpenStream( CDVDStreamInfo &hints, CDVDAudioCodec* codec )
-   m_synctype = SYNC_DISCON;
-   m_setsynctype = SYNC_DISCON;
-   if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK))
--    m_setsynctype = SYNC_RESAMPLE;
-+    m_setsynctype = CSettings::GetInstance().GetInt("videoplayer.synctype");
-   m_prevsynctype = -1;
- 
-   m_error = 0;
-   m_errors.Flush();
-+  m_plladjust = 1.0;
-+  m_last_plladjust = 1.0;
-   m_integral = 0;
-   m_prevskipped = false;
-   m_syncclock = true;
-@@ -229,7 +235,6 @@ void CDVDPlayerAudio::CloseStream(bool bWaitForBuffers)
- 
-   // uninit queue
-   m_messageQueue.End();
--
-   CLog::Log(LOGNOTICE, "Deleting audio codec");
-   if (m_pAudioCodec)
-   {
-@@ -482,7 +487,11 @@ void CDVDPlayerAudio::UpdatePlayerInfo()
-   //print the inverse of the resample ratio, since that makes more sense
-   //if the resample ratio is 0.5, then we're playing twice as fast
-   if (m_synctype == SYNC_RESAMPLE)
--    s << ", rr:" << std::fixed << std::setprecision(5) << 1.0 / m_resampleratio;
-+    s << ", rr:" << std::fixed << std::setprecision(5) << 1.0 / m_resampleratio << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
-+  if (m_synctype == SYNC_SKIPDUP)
-+    s << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
-+  if (m_synctype == SYNC_PLLADJUST)
-+    s << ", pll:" << std::fixed << std::setprecision(5) << g_RBP.GetAdjustHDMIClock() << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
- 
-   s << ", att:" << std::fixed << std::setprecision(1) << log(GetCurrentAttenuation()) * 20.0f << " dB";
- 
-@@ -637,8 +646,8 @@ void CDVDPlayerAudio::SetSyncType(bool passthrough)
- 
-   if (m_synctype != m_prevsynctype)
-   {
--    const char *synctypes[] = {"clock feedback", "skip/duplicate", "resample", "invalid"};
--    int synctype = (m_synctype >= 0 && m_synctype <= 2) ? m_synctype : 3;
-+    const char *synctypes[] = {"clock feedback", "skip/duplicate", "resample", "pll adjust", "invalid"};
-+    int synctype = (m_synctype >= 0 && m_synctype <= 3) ? m_synctype : 4;
-     CLog::Log(LOGDEBUG, "CDVDPlayerAudio:: synctype set to %i: %s", m_synctype, synctypes[synctype]);
-     m_prevsynctype = m_synctype;
-   }
-@@ -748,7 +757,19 @@ void CDVDPlayerAudio::HandleSyncError(double duration)
-       proportional = m_error / DVD_TIME_BASE / proportionaldiv;
-     }
-     m_resampleratio = 1.0 / m_pClock->GetClockSpeed() + proportional + m_integral;
-+    CLog::Log(LOGDEBUG, "CDVDPlayerAudio::%s rr:%.5f error:%.3fms", __FUNCTION__, m_resampleratio, m_error * 1e-3);
-+  }
-+  else if (m_synctype == SYNC_PLLADJUST)
-+  {
-+#if defined(TARGET_RASPBERRY_PI)
-+    double e = std::max(std::min(m_error / DVD_MSEC_TO_TIME(50), 1.0), -1.0);
-+    double adjust = g_advancedSettings.m_maxPllAdjust * 1e-6;
-+    m_plladjust = 1.0 + e * adjust;
-+    m_last_plladjust = g_RBP.AdjustHDMIClock(m_plladjust);
-+    CLog::Log(LOGDEBUG, "CDVDPlayerAudio::%s pll:%.5f (%.5f) error:%.6f e:%.6f a:%f", __FUNCTION__, m_plladjust, m_last_plladjust, m_error, e * adjust, adjust );
-+#endif
-   }
-+  m_last_error = m_error;
- }
- 
- bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-@@ -801,6 +822,7 @@ bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-     {
-       m_dvdAudio.AddPackets(audioframe);
-     }
-+    m_plladjust = 1.0;
-   }
-   else if (m_synctype == SYNC_DISCON)
-   {
-@@ -835,6 +857,10 @@ bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-     m_dvdAudio.SetResampleRatio(m_resampleratio);
-     m_dvdAudio.AddPackets(audioframe);
-   }
-+  else if (m_synctype == SYNC_PLLADJUST)
-+  {
-+    m_dvdAudio.AddPackets(audioframe);
-+  }
- 
-   return true;
- }
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.h b/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-index 014574d..409b2d7 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-@@ -228,6 +228,9 @@ protected:
-   bool   m_prevskipped;
-   double m_maxspeedadjust;
-   double m_resampleratio; //resample ratio when using SYNC_RESAMPLE, used for the codec info
-+  double m_plladjust;    // for display using SYNC_PLLADJUST
-+  double m_last_error;    // for display using SYNC_PLLADJUST
-+  double m_last_plladjust;    // for display using SYNC_PLLADJUST
- 
-   struct SInfo
-   {
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 6c5288d..a79d6d9 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -35,6 +35,7 @@ CRBP::CRBP()
-   m_DllBcmHost      = new DllBcmHost();
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-+  m_last_pll_adjust = 1.0;
- }
- 
- CRBP::~CRBP()
-@@ -225,4 +226,16 @@ void CRBP::Deinitialize()
-   m_initialized     = false;
-   m_omx_initialized = false;
- }
-+
-+double CRBP::AdjustHDMIClock(double adjust)
-+{
-+  char response[80];
-+  vc_gencmd(response, sizeof response, "hdmi_adjust_clock %f", adjust);
-+  char *p = strchr(response, '=');
-+  if (p)
-+    m_last_pll_adjust = atof(p+1);
-+  CLog::Log(LOGDEBUG, "CRBP::%s(%.4f) = %.4f", __func__, adjust, m_last_pll_adjust);
-+  return m_last_pll_adjust;
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 7fc8b42..db2fade 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -64,6 +64,8 @@ public:
-   unsigned char *CaptureDisplay(int width, int height, int *stride, bool swap_red_blue, bool video_only = true);
-   DllOMX *GetDllOMX() { return m_OMX ? m_OMX->GetDll() : NULL; }
-   void WaitVsync();
-+  double AdjustHDMIClock(double adjust);
-+  double GetAdjustHDMIClock() { return m_last_pll_adjust; }
- 
- private:
-   DllBcmHost *m_DllBcmHost;
-@@ -80,6 +82,7 @@ private:
-   CEvent     m_vsync;
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-+  double m_last_pll_adjust;
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 562757e..22b8459 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -116,6 +116,7 @@ void CAdvancedSettings::Initialize()
-   m_audioHeadRoom = 0;
-   m_ac3Gain = 12.0f;
-   m_audioApplyDrc = -1.0f;
-+  m_maxPllAdjust = 1000;
-   m_dvdplayerIgnoreDTSinWAV = false;
- 
-   //default hold time of 25 ms, this allows a 20 hertz sine to pass undistorted
-@@ -467,6 +468,7 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-   if (pElement)
-   {
-     XMLUtils::GetFloat(pElement, "ac3downmixgain", m_ac3Gain, -96.0f, 96.0f);
-+    XMLUtils::GetInt(pElement, "maxplladjust", m_maxPllAdjust, 0, 1000000);
-     XMLUtils::GetInt(pElement, "headroom", m_audioHeadRoom, 0, 12);
-     XMLUtils::GetString(pElement, "defaultplayer", m_audioDefaultPlayer);
-     // 101 on purpose - can be used to never automark as watched
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 6475350..93de9bd 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -143,6 +143,7 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
- 
-     int m_audioHeadRoom;
-     float m_ac3Gain;
-+    int m_maxPllAdjust;
-     std::string m_audioDefaultPlayer;
-     float m_audioPlayCountMinimumPercent;
-     bool m_dvdplayerIgnoreDTSinWAV;
-
-From cecfb10575958e190cf3c6394ff2158bff6fe52a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 28/93] [audio] Add settings option to boost centre channel
- when downmixing
-
-This allows a dB volume increase to be added to centre channel.
-This can help improve dialgue in the presence of background music/effects.
-It can go up to 30dB for testing purposes, but value of 6 is probably more reasonable.
-It is recommended to ensure "Normalise levels on downmix" is enabled when boosting by large values to avoid clipping.
-
-Should work with Pi Sink (dvdplayer/paplayer) and omxplayer
----
- addons/resource.language.en_gb/resources/strings.po       | 15 +++++++++++++++
- system/settings/settings.xml                              | 12 ++++++++++++
- .../Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp           |  7 +++++++
- .../AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp   |  6 ++++++
- xbmc/cores/omxplayer/OMXAudio.cpp                         |  6 ++++++
- 5 files changed, 46 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 0f45ea0..cc486da 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18162,3 +18162,18 @@ msgstr ""
- msgctxt "#38006"
- msgid "Audio has to stay in sync, this can either be done by resampling, or adjusting the PLL"
- msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38007"
-+msgid "Boost centre channel when downmixing"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38008"
-+msgid "Increase this value to make the dialogue louder compared to background sounds when downmixing multichannel audio"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38009"
-+msgid "%i dB"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 1b57136..918e8bf 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -2558,6 +2558,18 @@
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
-+         <setting id="audiooutput.boostcenter" type="integer" label="38007" help="38008">
-+          <level>2</level>
-+          <default>0</default>
-+          <constraints>
-+            <minimum>0</minimum>
-+            <step>1</step>
-+            <maximum>30</maximum>
-+          </constraints>
-+          <control type="spinner" format="string">
-+            <formatlabel>38009</formatlabel>
-+          </control>
-+        </setting>
-         <setting id="audiooutput.processquality" type="integer" label="13505" help="36169">
-           <requirement>HAS_AE_QUALITY_LEVELS</requirement>
-           <level>2</level>
-diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-index e4ddf9e..625ea88 100644
---- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-+++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-@@ -20,6 +20,7 @@
- 
- #include "cores/AudioEngine/Utils/AEUtil.h"
- #include "ActiveAEResampleFFMPEG.h"
-+#include "settings/Settings.h"
- #include "utils/log.h"
- 
- extern "C" {
-@@ -105,6 +106,12 @@ bool CActiveAEResampleFFMPEG::Init(uint64_t dst_chan_layout, int dst_channels, i
-   {
-      av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-   }
-+  int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+  if (boost_center)
-+  {
-+    float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+    av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+  }
- 
-   if (remapLayout)
-   {
-diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-index 7807149..698a6ae 100644
---- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-+++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-@@ -164,6 +164,12 @@ bool CActiveAEResamplePi::Init(uint64_t dst_chan_layout, int dst_channels, int d
-   {
-     av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-   }
-+  int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+  if (boost_center)
-+  {
-+    float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+    av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+  }
- 
-   if (remapLayout)
-   {
-diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
-index 08b1b84..70d0866 100644
---- a/xbmc/cores/omxplayer/OMXAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudio.cpp
-@@ -641,6 +641,12 @@ bool COMXAudio::Initialize(AEAudioFormat format, OMXClock *clock, CDVDStreamInfo
-     {
-        av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-     }
-+    int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+    if (boost_center)
-+    {
-+      float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+      av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+    }
- 
-     // stereo upmix
-     if (upmix && m_src_channels == 2 && m_dst_channels > 2)
-
-From cd089d7903e1fd4e0812ad817126a19d07fa896d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 29/93] [rbp] Default extract thumbnails to false
-
-It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
-It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
----
- system/settings/rbp.xml | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 7a170c2..1506035 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -42,6 +42,16 @@
-         </setting>
-       </group>
-     </category>
-+    <category id="myvideos">
-+      <group id="1">
-+        <setting id="myvideos.extractflags">
-+          <default>false</default>
-+        </setting>
-+        <setting id="myvideos.extractthumb">
-+          <default>false</default>
-+        </setting>
-+      </group>
-+    </category>
-   </section>
- 
-   <section id="system">
-
-From c0b8590f78235540d82d478334c7f30fae417754 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 10 Feb 2015 15:29:16 +0000
-Subject: [PATCH 30/93] [libcec] Add repeating keypress patch from popcornmix'
- repo
-
----
- tools/depends/target/libcec/Makefile         |   1 +
- tools/depends/target/libcec/popcornmix.patch | 859 +++++++++++++++++++++++++++
- 2 files changed, 860 insertions(+)
- create mode 100644 tools/depends/target/libcec/popcornmix.patch
-
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index f54af9e..ddf9963 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -21,6 +21,7 @@ $(TARBALLS_LOCATION)/$(ARCHIVE):
- $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)/build
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
-+	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-diff --git a/tools/depends/target/libcec/popcornmix.patch b/tools/depends/target/libcec/popcornmix.patch
-new file mode 100644
-index 0000000..8366a69
---- /dev/null
-+++ b/tools/depends/target/libcec/popcornmix.patch
-@@ -0,0 +1,859 @@
-+From ec982e9800ae312972d306b67779215a2add6cde Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 24 Oct 2014 13:45:21 +0100
-+Subject: [PATCH 1/6] Make released key polling wait for exact time until key
-+ gets released
-+
-+---
-+ src/libcec/CECClient.cpp    | 16 ++++++++++++++--
-+ src/libcec/CECClient.h      |  2 +-
-+ src/libcec/CECProcessor.cpp |  8 +++++---
-+ src/libcec/LibCEC.cpp       | 10 ++++++++--
-+ src/libcec/LibCEC.h         |  4 +++-
-+ 5 files changed, 31 insertions(+), 9 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 35c2d3e..e307c0e 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -1067,7 +1067,7 @@ void CCECClient::SetCurrentButton(const cec_user_control_code iButtonCode)
-+   AddKey(key);
-+ }
-+ 
-+-void CCECClient::CheckKeypressTimeout(void)
-++uint16_t CCECClient::CheckKeypressTimeout(void)
-+ {
-+   cec_keypress key;
-+ 
-+@@ -1091,12 +1091,24 @@ void CCECClient::CheckKeypressTimeout(void)
-+     }
-+     else
-+     {
-+-      return;
-++      // time when this keypress will be released and we'd like to be called again
-++      unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-++        timeout = iTimeoutMs - (iNow - m_buttontime) + 1;
-++      else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-++        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_buttontime) + 1;
-++      if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_buttontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++      }
-++      return timeout;
-+     }
-+   }
-+ 
-+   LIB_CEC->AddLog(CEC_LOG_DEBUG, "key auto-released: %s (%1x)", ToString(key.keycode), key.keycode);
-+   QueueAddKey(key);
-++  return CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+ }
-+ 
-+ bool CCECClient::EnableCallbacks(void *cbParam, ICECCallbacks *callbacks)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index 12f8a3b..c9ce5e3 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -272,7 +272,7 @@ namespace CEC
-+     virtual void                  AddKey(bool bSendComboKey = false);
-+     virtual void                  AddKey(const cec_keypress &key);
-+     virtual void                  SetCurrentButton(const cec_user_control_code iButtonCode);
-+-    virtual void                  CheckKeypressTimeout(void);
-++    virtual uint16_t              CheckKeypressTimeout(void);
-+     virtual void                  SourceActivated(const cec_logical_address logicalAddress);
-+     virtual void                  SourceDeactivated(const cec_logical_address logicalAddress);
-+ 
-+diff --git a/src/libcec/CECProcessor.cpp b/src/libcec/CECProcessor.cpp
-+index 99f71aa..604b950 100644
-+--- a/src/libcec/CECProcessor.cpp
-++++ b/src/libcec/CECProcessor.cpp
-+@@ -52,7 +52,6 @@
-+ using namespace CEC;
-+ using namespace PLATFORM;
-+ 
-+-#define CEC_PROCESSOR_SIGNAL_WAIT_TIME 1000
-+ #define ACTIVE_SOURCE_CHECK_INTERVAL   500
-+ #define TV_PRESENT_CHECK_INTERVAL      30000
-+ 
-+@@ -260,6 +259,7 @@ bool CCECProcessor::OnCommandReceived(const cec_command &command)
-+ 
-+ void *CCECProcessor::Process(void)
-+ {
-++  uint16_t timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   m_libcec->AddLog(CEC_LOG_DEBUG, "processor thread started");
-+ 
-+   if (!m_connCheck)
-+@@ -274,13 +274,13 @@ void *CCECProcessor::Process(void)
-+   while (!IsStopped() && m_communication->IsOpen())
-+   {
-+     // wait for a new incoming command, and process it
-+-    if (m_inBuffer.Pop(command, CEC_PROCESSOR_SIGNAL_WAIT_TIME))
-++    if (m_inBuffer.Pop(command, timeout))
-+       ProcessCommand(command);
-+ 
-+     if (CECInitialised() && !IsStopped())
-+     {
-+       // check clients for keypress timeouts
-+-      m_libcec->CheckKeypressTimeout();
-++      timeout = m_libcec->CheckKeypressTimeout();
-+ 
-+       // check if we need to replace handlers
-+       ReplaceHandlers();
-+@@ -311,6 +311,8 @@ void *CCECProcessor::Process(void)
-+         tvPresentCheck.Init(TV_PRESENT_CHECK_INTERVAL);
-+       }
-+     }
-++    else
-++      timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   }
-+ 
-+   return NULL;
-+diff --git a/src/libcec/LibCEC.cpp b/src/libcec/LibCEC.cpp
-+index af36b79..5ccb8dd 100644
-+--- a/src/libcec/LibCEC.cpp
-++++ b/src/libcec/LibCEC.cpp
-+@@ -361,11 +361,17 @@ bool CLibCEC::IsValidPhysicalAddress(uint16_t iPhysicalAddress)
-+          iPhysicalAddress <= CEC_MAX_PHYSICAL_ADDRESS;
-+ }
-+ 
-+-void CLibCEC::CheckKeypressTimeout(void)
-++uint16_t CLibCEC::CheckKeypressTimeout(void)
-+ {
-++  uint16_t timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   // check all clients
-+   for (std::vector<CECClientPtr>::iterator it = m_clients.begin(); it != m_clients.end(); it++)
-+-    (*it)->CheckKeypressTimeout();
-++  {
-++    uint16_t t = (*it)->CheckKeypressTimeout();
-++    if (t < timeout)
-++      timeout = t;
-++  }
-++  return timeout;
-+ }
-+ 
-+ void CLibCEC::AddLog(const cec_log_level level, const char *strFormat, ...)
-+diff --git a/src/libcec/LibCEC.h b/src/libcec/LibCEC.h
-+index 6d9a229..d9d1e7b 100644
-+--- a/src/libcec/LibCEC.h
-++++ b/src/libcec/LibCEC.h
-+@@ -39,6 +39,8 @@
-+ #include "CECTypeUtils.h"
-+ #include <memory>
-+ 
-++#define CEC_PROCESSOR_SIGNAL_WAIT_TIME 1000
-++
-+ namespace CEC
-+ {
-+   class CAdapterCommunication;
-+@@ -125,7 +127,7 @@ namespace CEC
-+ 
-+       void AddLog(const cec_log_level level, const char *strFormat, ...);
-+       void AddCommand(const cec_command &command);
-+-      void CheckKeypressTimeout(void);
-++      uint16_t CheckKeypressTimeout(void);
-+       void Alert(const libcec_alert type, const libcec_parameter &param);
-+ 
-+       static bool IsValidPhysicalAddress(uint16_t iPhysicalAddress);
-+-- 
-+1.9.1
-+
-+
-+From 41f0f3ec9ac136da3565c96fd5a7075499f3938d Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 24 Oct 2014 13:51:34 +0100
-+Subject: [PATCH 2/6] Keep track of time since initial button press and last
-+ button update
-+
-+---
-+ src/libcec/CECClient.cpp | 44 +++++++++++++++++++++++++++-----------------
-+ src/libcec/CECClient.h   |  3 ++-
-+ 2 files changed, 29 insertions(+), 18 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index e307c0e..e7935b9 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -54,7 +54,8 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_bInitialised(false),
-+     m_bRegistered(false),
-+     m_iCurrentButton(CEC_USER_CONTROL_CODE_UNKNOWN),
-+-    m_buttontime(0),
-++    m_initialButtontime(0),
-++    m_updateButtontime(0),
-+     m_iPreventForwardingPowerOffCommand(0),
-+     m_iLastKeypressTime(0)
-+ {
-+@@ -981,9 +982,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+     {
-+-      key.duration = (unsigned int) (GetTimeMs() - m_buttontime);
-++      unsigned int duration = (unsigned int) (GetTimeMs() - m_updateButtontime);
-++      key.duration = (unsigned int) (GetTimeMs() - m_initialButtontime);
-+ 
-+-      if (key.duration > m_configuration.iComboKeyTimeoutMs ||
-++      if (duration > m_configuration.iComboKeyTimeoutMs ||
-+           m_configuration.iComboKeyTimeoutMs == 0 ||
-+           m_iCurrentButton != m_configuration.comboKey ||
-+           bSendComboKey)
-+@@ -991,14 +993,15 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+         key.keycode = m_iCurrentButton;
-+ 
-+         m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-        m_buttontime = 0;
-++        m_initialButtontime = 0;
-++        m_updateButtontime = 0;
-+       }
-+     }
-+   }
-+ 
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x)", ToString(key.keycode), key.keycode);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+     QueueAddKey(key);
-+   }
-+ }
-+@@ -1012,7 +1015,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+     AddKey();
-+     return;
-+   }
-+-
-++  bool isrepeat = false;
-+   cec_keypress transmitKey(key);
-+   cec_user_control_code comboKey(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+       m_configuration.comboKey : CEC_USER_CONTROL_CODE_STOP);
-+@@ -1035,22 +1038,27 @@ void CCECClient::AddKey(const cec_keypress &key)
-+         AddKey(true);
-+     }
-+ 
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x) current(%lx) duration(%d)", ToString(transmitKey.keycode), transmitKey.keycode, m_iCurrentButton, key.duration);
-++
-+     if (m_iCurrentButton == key.keycode)
-+     {
-+-      m_buttontime = GetTimeMs();
-++      m_updateButtontime = GetTimeMs();
-++      isrepeat = true;
-+     }
-+     else
-+     {
-+-      AddKey();
-++      if (m_iCurrentButton != transmitKey.keycode)
-++        AddKey();
-+       if (key.duration == 0)
-+       {
-+         m_iCurrentButton = transmitKey.keycode;
-+-        m_buttontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-++        m_initialButtontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-++        m_updateButtontime = m_initialButtontime;
-+       }
-+     }
-+   }
-+ 
-+-  if (key.keycode != comboKey || key.duration > 0)
-++  if (!isrepeat && (key.keycode != comboKey || key.duration > 0))
-+   {
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x)", ToString(transmitKey.keycode), transmitKey.keycode);
-+     QueueAddKey(transmitKey);
-+@@ -1074,32 +1082,34 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   {
-+     CLockObject lock(m_mutex);
-+     uint64_t iNow = GetTimeMs();
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "%s T:%.3f", __FUNCTION__, iNow*1e-3);
-+     cec_user_control_code comboKey(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.comboKey : CEC_USER_CONTROL_CODE_STOP);
-+     uint32_t iTimeoutMs(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_buttontime > iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && iNow - m_buttontime > CEC_BUTTON_TIMEOUT)))
-++          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime > iTimeoutMs) ||
-++          (m_iCurrentButton != comboKey && iNow - m_updateButtontime > CEC_BUTTON_TIMEOUT)))
-+     {
-+-      key.duration = (unsigned int) (iNow - m_buttontime);
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+ 
-+       m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-      m_buttontime = 0;
-++      m_initialButtontime = 0;
-++      m_updateButtontime = 0;
-+     }
-+     else
-+     {
-+       // time when this keypress will be released and we'd like to be called again
-+       unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+-        timeout = iTimeoutMs - (iNow - m_buttontime) + 1;
-++        timeout = iTimeoutMs - (iNow - m_updateButtontime) + 1;
-+       else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-+-        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_buttontime) + 1;
-++        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_updateButtontime) + 1;
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_buttontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+       return timeout;
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index c9ce5e3..611c68b 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -404,7 +404,8 @@ namespace CEC
-+     PLATFORM::CMutex      m_mutex;                             /**< mutex for changes to this instance */
-+     PLATFORM::CMutex      m_cbMutex;                           /**< mutex that is held when doing anything with callbacks */
-+     cec_user_control_code m_iCurrentButton;                    /**< the control code of the button that's currently held down (if any) */
-+-    int64_t               m_buttontime;                        /**< the timestamp when the button was pressed (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_initialButtontime;                 /**< the timestamp when the button was initially pressed (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_updateButtontime;                  /**< the timestamp when the button was updated (in seconds since epoch), or 0 if none was pressed. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+     int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+     cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+-- 
-+1.9.1
-+
-+
-+From 273ead6980b69eddf98810eb1eb33d94a7d74fce Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Tue, 28 Oct 2014 00:09:18 +0000
-+Subject: [PATCH 3/6] Support repeating button presses with configurable repeat
-+ rate
-+
-+---
-+ include/cectypes.h                               |   6 ++
-+ src/libcec/CECClient.cpp                         | 100 +++++++++++++++++++----
-+ src/libcec/CECClient.h                           |   6 +-
-+ src/libcec/implementations/CECCommandHandler.cpp |   2 +-
-+ 4 files changed, 96 insertions(+), 18 deletions(-)
-+
-+diff --git a/include/cectypes.h b/include/cectypes.h
-+index acff259..8f098ef 100644
-+--- a/include/cectypes.h
-++++ b/include/cectypes.h
-+@@ -1493,6 +1493,8 @@ struct libcec_configuration
-+                                                    XXX changed meaning in 2.2.0 to not break binary compatibility. next major (3.0) release will fix it in a nicer way */
-+   cec_user_control_code comboKey;             /*!< key code that initiates combo keys. defaults to CEC_USER_CONTROL_CODE_F1_BLUE. CEC_USER_CONTROL_CODE_UNKNOWN to disable. added in 2.0.5 */
-+   uint32_t              iComboKeyTimeoutMs;   /*!< timeout until the combo key is sent as normal keypress */
-++  uint32_t              iButtonRepeatRateMs;  /*!< rate at which buttons autorepeat. 0 means rely on CEC device */
-++  uint32_t              iButtonReleaseDelayMs;/*!< duration after last update until a button is considered released */
-+ 
-+ #ifdef __cplusplus
-+    libcec_configuration(void) { Clear(); }
-+@@ -1527,6 +1529,8 @@ struct libcec_configuration
-+                  cecVersion                == other.cecVersion &&
-+                  adapterType               == other.adapterType &&
-+                  iDoubleTapTimeout50Ms     == other.iDoubleTapTimeout50Ms &&
-++                 iButtonRepeatRateMs       == other.iButtonRepeatRateMs &&
-++                 iButtonReleaseDelayMs     == other.iButtonReleaseDelayMs &&
-+                  (other.clientVersion <= LIBCEC_VERSION_TO_UINT(2, 0, 4) || comboKey            == other.comboKey) &&
-+                  (other.clientVersion <= LIBCEC_VERSION_TO_UINT(2, 0, 4) || iComboKeyTimeoutMs  == other.iComboKeyTimeoutMs) &&
-+                  (other.clientVersion <  LIBCEC_VERSION_TO_UINT(2, 1, 0) || bPowerOnScreensaver == other.bPowerOnScreensaver));
-+@@ -1567,6 +1571,8 @@ struct libcec_configuration
-+     iDoubleTapTimeout50Ms =           CEC_DOUBLE_TAP_TIMEOUT_50_MS;
-+     comboKey =                        CEC_USER_CONTROL_CODE_STOP;
-+     iComboKeyTimeoutMs =              CEC_DEFAULT_COMBO_TIMEOUT_MS;
-++    iButtonRepeatRateMs =             0;
-++    iButtonReleaseDelayMs =           CEC_BUTTON_TIMEOUT;
-+ 
-+     memset(strDeviceName, 0, 13);
-+     deviceTypes.Clear();
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index e7935b9..598628d 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -56,6 +56,10 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_iCurrentButton(CEC_USER_CONTROL_CODE_UNKNOWN),
-+     m_initialButtontime(0),
-+     m_updateButtontime(0),
-++    m_repeatButtontime(0),
-++    m_releaseButtontime(0),
-++    m_pressedButtoncount(0),
-++    m_releasedButtoncount(0),
-+     m_iPreventForwardingPowerOffCommand(0),
-+     m_iLastKeypressTime(0)
-+ {
-+@@ -851,6 +855,9 @@ bool CCECClient::GetCurrentConfiguration(libcec_configuration &configuration)
-+   configuration.bMonitorOnly              = m_configuration.bMonitorOnly;
-+   configuration.cecVersion                = m_configuration.cecVersion;
-+   configuration.adapterType               = m_configuration.adapterType;
-++  configuration.iDoubleTapTimeout50Ms     = m_configuration.iDoubleTapTimeout50Ms;
-++  configuration.iButtonRepeatRateMs       = m_configuration.iButtonRepeatRateMs;
-++  configuration.iButtonReleaseDelayMs     = m_configuration.iButtonReleaseDelayMs;
-+ 
-+   return true;
-+ }
-+@@ -894,6 +901,9 @@ bool CCECClient::SetConfiguration(const libcec_configuration &configuration)
-+     m_configuration.cecVersion                 = configuration.cecVersion;
-+     m_configuration.adapterType                = configuration.adapterType;
-+     m_configuration.iDoubleTapTimeout50Ms      = configuration.iDoubleTapTimeout50Ms;
-++    m_configuration.iButtonRepeatRateMs        = configuration.iButtonRepeatRateMs;
-++    m_configuration.iButtonReleaseDelayMs      = configuration.iButtonReleaseDelayMs;
-++
-+     m_configuration.deviceTypes.Add(configuration.deviceTypes[0]);
-+ 
-+     if (m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5))
-+@@ -950,6 +960,7 @@ bool CCECClient::SetConfiguration(const libcec_configuration &configuration)
-+     primary->ActivateSource();
-+   }
-+ 
-++  LIB_CEC->AddLog(CEC_LOG_DEBUG, "%s: %d:%d:%d", __FUNCTION__, DoubleTapTimeoutMS(), m_configuration.iButtonRepeatRateMs, m_configuration.iButtonReleaseDelayMs);
-+   return true;
-+ }
-+ 
-+@@ -973,11 +984,15 @@ void CCECClient::AddCommand(const cec_command &command)
-+   }
-+ }
-+ 
-+-void CCECClient::AddKey(bool bSendComboKey /* = false */)
-++void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /* = false */)
-+ {
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-++  // we ignore button releases when supporting repeating keys
-++  if (bButtonRelease && m_configuration.iButtonRepeatRateMs && m_configuration.iButtonReleaseDelayMs)
-++    return;
-++
-+   {
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+@@ -995,6 +1010,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+         m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+         m_initialButtontime = 0;
-+         m_updateButtontime = 0;
-++        m_repeatButtontime = 0;
-++        m_releaseButtontime = 0;
-++        m_pressedButtoncount = 0;
-++        m_releasedButtoncount = 0;
-+       }
-+     }
-+   }
-+@@ -1012,6 +1031,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+       key.keycode < CEC_USER_CONTROL_CODE_SELECT)
-+   {
-+     // send back the previous key if there is one
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "Unexpected key %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+     AddKey();
-+     return;
-+   }
-+@@ -1035,7 +1055,10 @@ void CCECClient::AddKey(const cec_keypress &key)
-+         transmitKey.keycode = CEC_USER_CONTROL_CODE_DOT;
-+       // default, send back the previous key
-+       else
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_DEBUG, "Combo key %s (%1x) D%dms:", ToString(key.keycode), key.keycode, key.duration);
-+         AddKey(true);
-++      }
-+     }
-+ 
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x) current(%lx) duration(%d)", ToString(transmitKey.keycode), transmitKey.keycode, m_iCurrentButton, key.duration);
-+@@ -1043,17 +1066,44 @@ void CCECClient::AddKey(const cec_keypress &key)
-+     if (m_iCurrentButton == key.keycode)
-+     {
-+       m_updateButtontime = GetTimeMs();
-+-      isrepeat = true;
-++      m_releaseButtontime = m_updateButtontime + (m_configuration.iButtonReleaseDelayMs ? m_configuration.iButtonReleaseDelayMs : CEC_BUTTON_TIMEOUT);
-++      // want to have seen some updated before considering a repeat
-++      if (m_configuration.iButtonRepeatRateMs)
-++      {
-++        if (!m_repeatButtontime && m_pressedButtoncount > 1)
-++          m_repeatButtontime = m_initialButtontime + DoubleTapTimeoutMS();
-++        isrepeat = true;
-++      }
-++      m_pressedButtoncount++;
-+     }
-+     else
-+     {
-+       if (m_iCurrentButton != transmitKey.keycode)
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_DEBUG, "Changed key %s (%1x) D:%dms cur:%lx", ToString(transmitKey.keycode), transmitKey.keycode, transmitKey.duration, m_iCurrentButton);
-+         AddKey();
-++      }
-+       if (key.duration == 0)
-+       {
-+         m_iCurrentButton = transmitKey.keycode;
-+-        m_initialButtontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-+-        m_updateButtontime = m_initialButtontime;
-++        if (m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN)
-++        {
-++          m_initialButtontime = 0;
-++          m_updateButtontime = 0;
-++          m_repeatButtontime = 0;
-++          m_releaseButtontime = 0;
-++          m_pressedButtoncount = 0;
-++          m_releasedButtoncount = 0;
-++        }
-++        else
-++        {
-++          m_initialButtontime = GetTimeMs();
-++          m_updateButtontime = m_initialButtontime;
-++          m_repeatButtontime = 0; // set this on next update
-++          m_releaseButtontime = m_initialButtontime + (m_configuration.iButtonReleaseDelayMs ? m_configuration.iButtonReleaseDelayMs : CEC_BUTTON_TIMEOUT);
-++          m_pressedButtoncount = 1;
-++          m_releasedButtoncount = 0;
-++        }
-+       }
-+     }
-+   }
-+@@ -1072,12 +1122,16 @@ void CCECClient::SetCurrentButton(const cec_user_control_code iButtonCode)
-+   key.duration = 0;
-+   key.keycode = iButtonCode;
-+ 
-++  LIB_CEC->AddLog(CEC_LOG_DEBUG, "SetCurrentButton %s (%1x) D:%dms cur:%lx", ToString(key.keycode), key.keycode, key.duration);
-+   AddKey(key);
-+ }
-+ 
-+ uint16_t CCECClient::CheckKeypressTimeout(void)
-+ {
-++  // time when we'd like to be called again
-++  unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   cec_keypress key;
-++  key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-+   {
-+     CLockObject lock(m_mutex);
-+@@ -1089,8 +1143,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime > iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && iNow - m_updateButtontime > CEC_BUTTON_TIMEOUT)))
-++          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs) ||
-++          (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)))
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1098,27 +1152,41 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+       m_initialButtontime = 0;
-+       m_updateButtontime = 0;
-++      m_repeatButtontime = 0;
-++      m_releaseButtontime = 0;
-++      m_pressedButtoncount = 0;
-++      m_releasedButtoncount = 0;
-++    }
-++    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-++          (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-++    {
-++      key.duration = 0;
-++      key.keycode = m_iCurrentButton;
-++      m_repeatButtontime = iNow + m_configuration.iButtonRepeatRateMs;
-++      timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+     }
-+     else
-+     {
-+-      // time when this keypress will be released and we'd like to be called again
-+-      unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+-        timeout = iTimeoutMs - (iNow - m_updateButtontime) + 1;
-+-      else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-+-        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_updateButtontime) + 1;
-++        timeout = std::min((uint64_t)timeout, m_updateButtontime - iNow + iTimeoutMs);
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_releaseButtontime)
-++        timeout = std::min((uint64_t)timeout, m_releaseButtontime - iNow);
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_repeatButtontime)
-++        timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, m_releaseButtontime*1e-3, m_iCurrentButton);
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+-      return timeout;
-+     }
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key %s: %s (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : key.duration ? "released" : "repeated",
-++        ToString(m_iCurrentButton), m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-+   }
-+ 
-+-  LIB_CEC->AddLog(CEC_LOG_DEBUG, "key auto-released: %s (%1x)", ToString(key.keycode), key.keycode);
-+-  QueueAddKey(key);
-+-  return CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++  if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-++    QueueAddKey(key);
-++
-++  return timeout;
-+ }
-+ 
-+ bool CCECClient::EnableCallbacks(void *cbParam, ICECCallbacks *callbacks)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index 611c68b..adeb5af 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -269,7 +269,7 @@ namespace CEC
-+     // callbacks
-+     virtual void                  Alert(const libcec_alert type, const libcec_parameter &param) { QueueAlert(type, param); }
-+     virtual void                  AddLog(const cec_log_message &message) { QueueAddLog(message); }
-+-    virtual void                  AddKey(bool bSendComboKey = false);
-++    virtual void                  AddKey(bool bSendComboKey = false, bool bButtonRelease = false);
-+     virtual void                  AddKey(const cec_keypress &key);
-+     virtual void                  SetCurrentButton(const cec_user_control_code iButtonCode);
-+     virtual uint16_t              CheckKeypressTimeout(void);
-+@@ -406,6 +406,10 @@ namespace CEC
-+     cec_user_control_code m_iCurrentButton;                    /**< the control code of the button that's currently held down (if any) */
-+     int64_t               m_initialButtontime;                 /**< the timestamp when the button was initially pressed (in seconds since epoch), or 0 if none was pressed. */
-+     int64_t               m_updateButtontime;                  /**< the timestamp when the button was updated (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_repeatButtontime;                  /**< the timestamp when the button will next repeat (in seconds since epoch), or 0 if repeat is disabled. */
-++    int64_t               m_releaseButtontime;                 /**< the timestamp when the button will be released (in seconds since epoch), or 0 if none was pressed. */
-++    int32_t               m_pressedButtoncount;                /**< the number of times a button released message has been seen for this press. */
-++    int32_t               m_releasedButtoncount;               /**< the number of times a button pressed message has been seen for this press. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+     int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+     cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+diff --git a/src/libcec/implementations/CECCommandHandler.cpp b/src/libcec/implementations/CECCommandHandler.cpp
-+index 6d6244e..d64186f 100644
-+--- a/src/libcec/implementations/CECCommandHandler.cpp
-++++ b/src/libcec/implementations/CECCommandHandler.cpp
-+@@ -770,7 +770,7 @@ int CCECCommandHandler::HandleUserControlRelease(const cec_command &command)
-+ 
-+   CECClientPtr client = m_processor->GetClient(command.destination);
-+   if (client)
-+-    client->AddKey();
-++    client->AddKey(false, true);
-+ 
-+   return COMMAND_HANDLED;
-+ }
-+-- 
-+1.9.1
-+
-+
-+From 3336d0827f7fd159430f3431642b07090c06c869 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Tue, 28 Oct 2014 01:21:35 +0000
-+Subject: [PATCH 4/6] Skip double press removal. It is handled through other
-+ means.
-+
-+---
-+ src/libcec/CECClient.cpp | 18 +-----------------
-+ src/libcec/CECClient.h   |  2 --
-+ 2 files changed, 1 insertion(+), 19 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 598628d..dccd874 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -60,11 +60,8 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_releaseButtontime(0),
-+     m_pressedButtoncount(0),
-+     m_releasedButtoncount(0),
-+-    m_iPreventForwardingPowerOffCommand(0),
-+-    m_iLastKeypressTime(0)
-++    m_iPreventForwardingPowerOffCommand(0)
-+ {
-+-  m_lastKeypress.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-  m_lastKeypress.duration = 0;
-+   m_configuration.Clear();
-+   // set the initial configuration
-+   SetConfiguration(configuration);
-+@@ -1647,20 +1644,7 @@ void CCECClient::CallbackAddKey(const cec_keypress &key)
-+ {
-+   CLockObject lock(m_cbMutex);
-+   if (m_configuration.callbacks && m_configuration.callbacks->CBCecKeyPress)
-+-  {
-+-    // prevent double taps
-+-    int64_t now = GetTimeMs();
-+-    if (m_lastKeypress.keycode != key.keycode ||
-+-        key.duration > 0 ||
-+-        now - m_iLastKeypressTime >= DoubleTapTimeoutMS())
-+-    {
-+-      // no double tap
-+-      if (key.duration == 0)
-+-        m_iLastKeypressTime = now;
-+-      m_lastKeypress = key;
-+       m_configuration.callbacks->CBCecKeyPress(m_configuration.callbackParam, key);
-+-    }
-+-  }
-+ }
-+ 
-+ void CCECClient::CallbackAddLog(const cec_log_message &message)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index adeb5af..43a713b 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -411,8 +411,6 @@ namespace CEC
-+     int32_t               m_pressedButtoncount;                /**< the number of times a button released message has been seen for this press. */
-+     int32_t               m_releasedButtoncount;               /**< the number of times a button pressed message has been seen for this press. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+-    int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+-    cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+     PLATFORM::SyncedBuffer<CCallbackWrap*> m_callbackCalls;
-+   };
-+ }
-+-- 
-+1.9.1
-+
-+
-+From 0dd0234f620a546bfa843172648383f83d88088c Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Mon, 3 Nov 2014 23:28:04 +0000
-+Subject: [PATCH 5/6] Pass through duration on all button repeats
-+
-+---
-+ src/libcec/CECClient.cpp | 34 ++++++++++++++++++++++++----------
-+ 1 file changed, 24 insertions(+), 10 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index dccd874..1946148 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -986,10 +986,6 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /*
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-+-  // we ignore button releases when supporting repeating keys
-+-  if (bButtonRelease && m_configuration.iButtonRepeatRateMs && m_configuration.iButtonReleaseDelayMs)
-+-    return;
-+-
-+   {
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+@@ -1015,6 +1011,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /*
-+     }
-+   }
-+ 
-++  // we don't forward releases when supporting repeating keys
-++  if (bButtonRelease && m_configuration.iButtonRepeatRateMs)
-++    return;
-++
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+   {
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+@@ -1107,7 +1107,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+ 
-+   if (!isrepeat && (key.keycode != comboKey || key.duration > 0))
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x)", ToString(transmitKey.keycode), transmitKey.keycode);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x, %d)", ToString(transmitKey.keycode), transmitKey.keycode, transmitKey.duration);
-+     QueueAddKey(transmitKey);
-+   }
-+ }
-+@@ -1129,6 +1129,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-++  key.duration = 0;
-+ 
-+   {
-+     CLockObject lock(m_mutex);
-+@@ -1140,8 +1141,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)))
-++          m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1155,9 +1155,23 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_releasedButtoncount = 0;
-+     }
-+     else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-++          m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-++    {
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-++      key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-++
-++      m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-++      m_initialButtontime = 0;
-++      m_updateButtontime = 0;
-++      m_repeatButtontime = 0;
-++      m_releaseButtontime = 0;
-++      m_pressedButtoncount = 0;
-++      m_releasedButtoncount = 0;
-++    }
-++    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+           (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-+     {
-+-      key.duration = 0;
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+       m_repeatButtontime = iNow + m_configuration.iButtonRepeatRateMs;
-+       timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+@@ -1176,8 +1190,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+     }
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key %s: %s (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : key.duration ? "released" : "repeated",
-+-        ToString(m_iCurrentButton), m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "Key %s: %s (duration:%d) (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", ToString(m_iCurrentButton), key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : m_repeatButtontime ? "repeated" : "released", key.duration,
-++        m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-+   }
-+ 
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+-- 
-+1.9.1
-+
-+
-+From 1ea01f59d8186d4d53af41961aaccbbc11651115 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Wed, 5 Nov 2014 21:04:25 +0000
-+Subject: [PATCH 6/6] squash: Fix for stop needing to be pressed twice
-+
-+---
-+ src/libcec/CECClient.cpp | 17 ++++++++---------
-+ 1 file changed, 8 insertions(+), 9 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 1946148..f4f114b 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -1131,6 +1131,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+   key.duration = 0;
-+ 
-++  if (m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN)
-++    return timeout;
-+   {
-+     CLockObject lock(m_mutex);
-+     uint64_t iNow = GetTimeMs();
-+@@ -1140,8 +1142,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+     uint32_t iTimeoutMs(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+-    if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-++    if (m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1154,8 +1155,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_pressedButtoncount = 0;
-+       m_releasedButtoncount = 0;
-+     }
-+-    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-++    else if (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+@@ -1168,8 +1168,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_pressedButtoncount = 0;
-+       m_releasedButtoncount = 0;
-+     }
-+-    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-++    else if (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1178,11 +1177,11 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+     }
-+     else
-+     {
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-++      if (m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+         timeout = std::min((uint64_t)timeout, m_updateButtontime - iNow + iTimeoutMs);
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_releaseButtontime)
-++      if (m_iCurrentButton != comboKey && m_releaseButtontime)
-+         timeout = std::min((uint64_t)timeout, m_releaseButtontime - iNow);
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_repeatButtontime)
-++      if (m_iCurrentButton != comboKey && m_repeatButtontime)
-+         timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-- 
-+1.9.1
-+
-
-From bfc97f9146e8ac70fb03c439a4cf1a9a3135ea9b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Oct 2014 00:19:40 +0000
-Subject: [PATCH 31/93] [cec] Add settings for configuring button repeats
-
----
- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
- system/peripherals.xml                              |  4 +++-
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
- 3 files changed, 34 insertions(+), 1 deletion(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index cc486da..f9b8277 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18177,3 +18177,18 @@ msgstr ""
- msgctxt "#38009"
- msgid "%i dB"
- msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38050"
-+msgid "Remote button press delay before repeating (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38051"
-+msgid "Remote button press repeat rate (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38052"
-+msgid "Remote button press release time (ms)"
-+msgstr ""
-diff --git a/system/peripherals.xml b/system/peripherals.xml
-index ec3c3fe..c3dbae0 100644
---- a/system/peripherals.xml
-+++ b/system/peripherals.xml
-@@ -31,7 +31,9 @@
-     <setting key="device_type" type="int" value="1" configurable="0" />
-     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
-     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
--    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
-+    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
-+    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
-+    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
-   </peripheral>
- 
-   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index e0d8dae..f738c84 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -1278,6 +1278,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
-   m_configuration.bSendInactiveSource = config.bSendInactiveSource;
-   bChanged |= SetSetting("send_inactive_source", m_configuration.bSendInactiveSource == 1);
- 
-+#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
-+  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
-+#else
-+  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs;
-+#endif
-+
-+  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
-+  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
-+
-+  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
-+  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
-+
-   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
-   m_configuration.bShutdownOnStandby = config.bShutdownOnStandby;
- 
-@@ -1382,6 +1396,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
-   // backwards compatibility. will be removed once the next major release of libCEC is out
-   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
- #endif
-+  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
-+  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
- 
-   if (GetSettingBool("pause_playback_on_deactivate"))
-   {
-
-From af63fad05fc2f6c24354c7acd08cd685ff376e28 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 3 Nov 2014 23:17:46 +0000
-Subject: [PATCH 32/93] [cec] Don't discard buttons when repeat mode is enabled
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index f738c84..58d7d0d 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -770,7 +770,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
- 
-   CSingleLock lock(m_critSection);
--  if (key.iDuration > 0)
-+  // avoid the queue getting too long
-+  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+    return;
-+  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-     {
-
-From 42155d82d06a1deea72d4c3092315ea1110c6cb7 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 33/93] [cec] Temp - more logging
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index 58d7d0d..dfba61a 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -767,12 +767,15 @@ void CPeripheralCecAdapter::GetNextKey(void)
- 
- void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
- {
--  CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
-+  CLog::Log(LOGDEBUG, "%s - received key %2x duration %d (rep:%d size:%d)", __FUNCTION__, key.iButton, key.iDuration, m_configuration.iButtonRepeatRateMs, m_buttonQueue.size());
- 
-   CSingleLock lock(m_critSection);
-   // avoid the queue getting too long
-   if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+  {
-+    CLog::Log(LOGDEBUG, "%s - discarded key %2x", __FUNCTION__, key.iButton);
-     return;
-+  }
-   if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-@@ -781,6 +784,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-       if (m_bHasButton)
-         m_currentButton.iDuration = key.iDuration;
-       // ignore this one, since it's already been handled by xbmc
-+      CLog::Log(LOGDEBUG, "%s - ignored key %2x", __FUNCTION__, key.iButton);
-       return;
-     }
-     // if we received a keypress with a duration set, try to find the same one without a duration set, and replace it
-@@ -791,6 +795,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-         if ((*it).iDuration == 0)
-         {
-           // replace this entry
-+          CLog::Log(LOGDEBUG, "%s - replaced key %2x", __FUNCTION__, key.iButton);
-           (*it).iDuration = key.iDuration;
-           return;
-         }
-@@ -800,6 +805,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-     }
-   }
- 
-+  CLog::Log(LOGDEBUG, "%s - added key %2x", __FUNCTION__, key.iButton);
-   m_buttonQueue.push_back(key);
- }
- 
-
-From f8d6e97fedcb9184af7dfc8a976815892faa7784 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 15 Nov 2014 12:03:34 +0000
-Subject: [PATCH 34/93] [dvdplayer] Add lock for player creation
-
----
- xbmc/cores/dvdplayer/DVDPlayer.cpp | 3 +++
- xbmc/cores/dvdplayer/DVDPlayer.h   | 1 +
- 2 files changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.cpp b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-index 4b09e8f..abcb8d2 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-@@ -535,6 +535,7 @@ int CSelectionStreams::CountSource(StreamType type, StreamSource source) const
- 
- void CDVDPlayer::CreatePlayers()
- {
-+  CSingleLock lock(m_players_lock);
- #ifdef HAS_OMXPLAYER
-   bool omx_suitable = !OMXPlayerUnsuitable(m_HasVideo, m_HasAudio, m_pDemuxer, m_pInputStream, m_SelectionStreams);
-   if (m_omxplayer_mode != omx_suitable)
-@@ -566,6 +567,7 @@ void CDVDPlayer::CreatePlayers()
- 
- void CDVDPlayer::DestroyPlayers()
- {
-+  CSingleLock lock(m_players_lock);
-   if (!m_players_created)
-     return;
-   delete m_dvdPlayerVideo;
-@@ -4377,6 +4379,7 @@ double CDVDPlayer::GetQueueTime()
- 
- void CDVDPlayer::GetVideoStreamInfo(SPlayerVideoStreamInfo &info)
- {
-+  CSingleLock lock(m_players_lock);
-   info.bitrate = m_dvdPlayerVideo->GetVideoBitrate();
- 
-   std::string retVal;
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.h b/xbmc/cores/dvdplayer/DVDPlayer.h
-index 2f00647..b1418e3 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.h
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.h
-@@ -567,4 +567,5 @@ protected:
-   // omxplayer variables
-   struct SOmxPlayerState m_OmxPlayerState;
-   bool m_omxplayer_mode;            // using omxplayer acceleration
-+  CCriticalSection m_players_lock;
- };
-
-From 2e80c975eb2d085f157ea328488aa7889c092f47 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 24 Nov 2014 22:07:25 +0000
-Subject: [PATCH 35/93] [dvdplayervideo] Prod decoder when in stills mode
-
-An asynchronous hardware decoder doesn't only produce output pictures when new packets arrive.
-In dvd stills mode give it a chance to return pictures that weren't ready when frame was decoded.
----
- xbmc/cores/dvdplayer/DVDPlayerVideo.cpp | 46 ++++++++++++++++++++-------------
- 1 file changed, 28 insertions(+), 18 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-index 379c541..b5777a1 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-@@ -313,7 +313,8 @@ void CDVDPlayerVideo::Process()
- 
-   while (!m_bStop)
-   {
--    int iQueueTimeOut = (int)(m_stalled ? frametime / 4 : frametime * 10) / 1000;
-+    bool bPictureWaiting = m_hints.stills && (m_pVideoCodec->Decode(NULL, 0, DVD_NOPTS_VALUE, DVD_NOPTS_VALUE) & VC_PICTURE);
-+    int iQueueTimeOut = (int)(bPictureWaiting ? 0 : (m_hints.stills || m_stalled) ? frametime / 4 : frametime * 10) / 1000;
-     int iPriority = (m_speed == DVD_PLAYSPEED_PAUSE && m_started) ? 1 : 0;
- 
-     CDVDMsg* pMsg;
-@@ -330,27 +331,36 @@ void CDVDPlayerVideo::Process()
-       if( iPriority )
-         continue;
- 
--      //Okey, start rendering at stream fps now instead, we are likely in a stillframe
--      if( !m_stalled )
-+      // check for picture waiting
-+      if (bPictureWaiting)
-       {
--        if(m_started)
--          CLog::Log(LOGINFO, "CDVDPlayerVideo - Stillframe detected, switching to forced %f fps", m_fFrameRate);
--        m_stalled = true;
--        pts+= frametime*4;
-+        // create a dummy demuxer packet to prod the decode logic
-+        pMsg = new CDVDMsgDemuxerPacket(CDVDDemuxUtils::AllocateDemuxPacket(0), false);
-       }
--
--      //Waiting timed out, output last picture
--      if( picture.iFlags & DVP_FLAG_ALLOCATED )
-+      else
-       {
--        //Remove interlaced flag before outputting
--        //no need to output this as if it was interlaced
--        picture.iFlags &= ~DVP_FLAG_INTERLACED;
--        picture.iFlags |= DVP_FLAG_NOSKIP;
--        OutputPicture(&picture, pts);
--        pts+= frametime;
--      }
-+        //Okey, start rendering at stream fps now instead, we are likely in a stillframe
-+        if( !m_stalled )
-+        {
-+          if(m_started)
-+            CLog::Log(LOGINFO, "CDVDPlayerVideo - Stillframe detected, switching to forced %f fps", m_fFrameRate);
-+          m_stalled = true;
-+          pts+= frametime*4;
-+        }
- 
--      continue;
-+        //Waiting timed out, output last picture
-+        if( picture.iFlags & DVP_FLAG_ALLOCATED )
-+        {
-+          //Remove interlaced flag before outputting
-+          //no need to output this as if it was interlaced
-+          picture.iFlags &= ~DVP_FLAG_INTERLACED;
-+          picture.iFlags |= DVP_FLAG_NOSKIP;
-+          OutputPicture(&picture, pts);
-+          pts+= frametime;
-+        }
-+
-+        continue;
-+      }
-     }
- 
-     if (pMsg->IsType(CDVDMsg::GENERAL_SYNCHRONIZE))
-
-From 1a4b613e9981829137c817baad127fda8e1e2823 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 36/93] [languageinvoker] Reduce priority of python threads
-
----
- xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/xbmc/interfaces/generic/LanguageInvokerThread.cpp b/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-index fcdd063..16f0c89 100644
---- a/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-+++ b/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-@@ -50,6 +50,11 @@ bool CLanguageInvokerThread::execute(const std::string &script, const std::vecto
-   m_args = arguments;
- 
-   Create();
-+  #ifdef TARGET_RASPBERRY_PI
-+  /* low prio */
-+  SetPriority(GetPriority()-1);
-+  #endif
-+
-   return true;
- }
- 
-
-From 73c6f413799cbb821f597253eb80457ee29a45f8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 14 Dec 2013 16:55:05 +0000
-Subject: [PATCH 37/93] logging: Add microsecond timer to log messages
-
----
- xbmc/utils/log.cpp | 17 +++++++++++++++--
- 1 file changed, 15 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/utils/log.cpp b/xbmc/utils/log.cpp
-index 3443f12..31c4a99 100644
---- a/xbmc/utils/log.cpp
-+++ b/xbmc/utils/log.cpp
-@@ -24,6 +24,7 @@
- #include "threads/Thread.h"
- #include "utils/StringUtils.h"
- #include "CompileInfo.h"
-+#include "utils/TimeUtils.cpp"
- 
- static const char* const levelNames[] =
- {"DEBUG", "INFO", "NOTICE", "WARNING", "ERROR", "SEVERE", "FATAL", "NONE"};
-@@ -198,19 +199,31 @@ void CLog::PrintDebugString(const std::string& line)
- 
- bool CLog::WriteLogString(int logLevel, const std::string& logString)
- {
-+#if defined(TARGET_LINUX)
-+  static const char* prefixFormat = "%02.2d:%02.2d:%02.2d %10.6f T:%" PRIu64" %7s: ";
-+#else
-   static const char* prefixFormat = "%02.2d:%02.2d:%02.2d T:%" PRIu64" %7s: ";
--
-+#endif
-   std::string strData(logString);
-   /* fixup newline alignment, number of spaces should equal prefix length */
-   StringUtils::Replace(strData, "\n", "\n                                            ");
- 
-   int hour, minute, second;
-   s_globals.m_platform.GetCurrentLocalTime(hour, minute, second);
--  
-+
-+#if defined(TARGET_LINUX)
-+  struct timespec now;
-+  clock_gettime(CLOCK_MONOTONIC, &now);
-+  float Now = now.tv_sec + now.tv_nsec * 1e-9;
-+#endif
-+
-   strData = StringUtils::Format(prefixFormat,
-                                   hour,
-                                   minute,
-                                   second,
-+#if defined(TARGET_LINUX)
-+                                  Now,
-+#endif
-                                   (uint64_t)CThread::GetCurrentThreadId(),
-                                   levelNames[logLevel]) + strData;
- 
-
-From dd959edaaae1f167e0979ac55d64e5d769127687 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 38/93] [rbp] hack: wait for splash to complete before changing
- hdmi mode
-
----
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp | 49 +++++++++++++++++++++++++
- 1 file changed, 49 insertions(+)
-
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index ee29770..ff0d3e3 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -221,12 +221,61 @@ int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::ve
- }
- #endif
- 
-+#include <dirent.h>
-+
-+pid_t proc_find(const char* name)
-+{
-+    DIR* dir;
-+    struct dirent* ent;
-+    char buf[512];
-+
-+    long  pid;
-+    char pname[100] = {0,};
-+    char state;
-+    FILE *fp=NULL;
-+
-+    if (!(dir = opendir("/proc"))) {
-+        perror("can't open /proc");
-+        return -1;
-+    }
-+
-+    while((ent = readdir(dir)) != NULL) {
-+        long lpid = atol(ent->d_name);
-+        if(lpid < 0)
-+            continue;
-+        snprintf(buf, sizeof(buf), "/proc/%ld/stat", lpid);
-+        fp = fopen(buf, "r");
-+
-+        if (fp) {
-+            if ( (fscanf(fp, "%ld (%[^)]) %c", &pid, pname, &state)) != 3 ){
-+                printf("fscanf failed \n");
-+                fclose(fp);
-+                closedir(dir);
-+                return -1;
-+            }
-+            if (!strcmp(pname, name)) {
-+                fclose(fp);
-+                closedir(dir);
-+                return (pid_t)lpid;
-+            }
-+            fclose(fp);
-+        }
-+    }
-+
-+    closedir(dir);
-+    return -1;
-+}
-+
-+
- bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- {
- #if defined(TARGET_RASPBERRY_PI)
-   if(!m_DllBcmHost || !m_nativeWindow)
-     return false;
- 
-+  while (proc_find("hello_video.bin") >= 0)
-+    Sleep(100);
-+
-   DestroyDispmaxWindow();
- 
-   RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
-
-From 3afc5e302cd1fdad4afa43ec705707f36de2ddaf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 39/93] Fix for UI not showing both extractflags and
- extractthumb
-
----
- addons/resource.language.en_gb/resources/strings.po | 11 ++++++++---
- system/settings/settings.xml                        |  4 ++--
- 2 files changed, 10 insertions(+), 5 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index f9b8277..32314b2 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -11815,7 +11815,7 @@ msgstr ""
- 
- #: system/settings/settings.xml
- msgctxt "#20433"
--msgid "Extract thumbnails and video information"
-+msgid "Extract video information from files"
- msgstr ""
- 
- #: xbmc/dialogs/GUIDialogSmartPlaylistRule.cpp
-@@ -15987,7 +15987,7 @@ msgstr ""
- #. Description of setting "Videos -> File lists -> Extract thumbnails and video information" with label #20433
- #: system/settings/settings.xml
- msgctxt "#36178"
--msgid "Extract thumbnails and metadata information such as codec and aspect ratio from videos."
-+msgid "Extract metadata information such as codec and aspect ratio from videos."
- msgstr ""
- 
- #. Description of setting "Videos -> File lists -> Replace file names with library titles" with label #20419
-@@ -15999,7 +15999,7 @@ msgstr ""
- #. Description of setting "Videos -> File lists -> Extract thumbnails and video information" with label #20433
- #: system/settings/settings.xml
- msgctxt "#36180"
--msgid "Extract thumbnails and information, such as codecs and aspect ratio, to display in library mode."
-+msgid "Extract thumbnails, to display in library Mode."
- msgstr ""
- 
- #: system/settings/settings.xml
-@@ -18192,3 +18192,8 @@ msgstr ""
- msgctxt "#38052"
- msgid "Remote button press release time (ms)"
- msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38103"
-+msgid "Extract thumbnails from video files"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 918e8bf..61e1a22 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -885,8 +885,8 @@
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
--        <setting id="myvideos.extractthumb" type="boolean" label="20433" help="36180">
--          <level>4</level>
-+        <setting id="myvideos.extractthumb" type="boolean" label="38103" help="36180">
-+          <level>1</level>
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
-
-From c423d114818b5cd611bd83c31cda74139b5dfd91 Mon Sep 17 00:00:00 2001
-From: anaconda <anaconda@menakite.eu>
-Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 40/93] Disable autoscrolling while on screensaver and while
- opening streams.
-
----
- xbmc/Application.cpp                | 10 ++++++++++
- xbmc/Application.h                  |  2 ++
- xbmc/guilib/GUIFadeLabelControl.cpp |  4 +++-
- xbmc/guilib/GUIFont.cpp             |  4 ++++
- xbmc/guilib/GUILabel.cpp            |  4 +++-
- xbmc/guilib/GUITextBox.cpp          |  3 ++-
- 6 files changed, 24 insertions(+), 3 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index cb10ffa..c274e2f 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -5296,3 +5296,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
-   
-   return false;
- }
-+
-+bool CApplication::ScreenSaverDisablesAutoScrolling()
-+{
-+  bool onBlackDimScreenSaver = IsInScreenSaver() && m_screenSaver &&
-+    (m_screenSaver->ID() == "screensaver.xbmc.builtin.black" ||
-+     m_screenSaver->ID() == "screensaver.xbmc.builtin.dim");
-+  bool openingStreams = m_pPlayer->IsPlaying() && g_windowManager.IsWindowActive(WINDOW_DIALOG_BUSY);
-+
-+  return onBlackDimScreenSaver || openingStreams;
-+}
-diff --git a/xbmc/Application.h b/xbmc/Application.h
-index d7e5eee..a34ed98 100644
---- a/xbmc/Application.h
-+++ b/xbmc/Application.h
-@@ -390,6 +390,8 @@ public:
-    */
-   void UnregisterActionListener(IActionListener *listener);
- 
-+  bool ScreenSaverDisablesAutoScrolling();
-+
- protected:
-   virtual bool OnSettingsSaving() const override;
- 
-diff --git a/xbmc/guilib/GUIFadeLabelControl.cpp b/xbmc/guilib/GUIFadeLabelControl.cpp
-index ebd435e..97efc8a 100644
---- a/xbmc/guilib/GUIFadeLabelControl.cpp
-+++ b/xbmc/guilib/GUIFadeLabelControl.cpp
-@@ -20,6 +20,8 @@
- 
- #include "GUIFadeLabelControl.h"
- 
-+#include "Application.h"
-+
- CGUIFadeLabelControl::CGUIFadeLabelControl(int parentID, int controlID, float posX, float posY, float width, float height, const CLabelInfo& labelInfo, bool scrollOut, unsigned int timeToDelayAtEnd, bool resetOnLabelChange, bool randomized)
-     : CGUIControl(parentID, controlID, posX, posY, width, height), m_label(labelInfo), m_scrollInfo(50, labelInfo.offsetX, labelInfo.scrollSpeed)
-     , m_textLayout(labelInfo.font, false)
-@@ -105,7 +107,7 @@ void CGUIFadeLabelControl::Process(unsigned int currentTime, CDirtyRegionList &d
-     m_lastLabel = m_currentLabel;
-   }
- 
--  if (m_infoLabels.size() > 1 || !m_shortText)
-+  if ((m_infoLabels.size() > 1 || !m_shortText) && !g_application.ScreenSaverDisablesAutoScrolling())
-   { // have scrolling text
-     bool moveToNextLabel = false;
-     if (!m_scrollOut)
-diff --git a/xbmc/guilib/GUIFont.cpp b/xbmc/guilib/GUIFont.cpp
-index 7f11089..1192b74 100644
---- a/xbmc/guilib/GUIFont.cpp
-+++ b/xbmc/guilib/GUIFont.cpp
-@@ -22,6 +22,7 @@
- #include "GUIFontTTF.h"
- #include "GraphicContext.h"
- 
-+#include "Application.h"
- #include "threads/SingleLock.h"
- #include "utils/TimeUtils.h"
- #include "utils/MathUtils.h"
-@@ -128,6 +129,9 @@ bool CGUIFont::UpdateScrollInfo(const vecText &text, CScrollInfo &scrollInfo)
-   //   If the string is smaller than the viewport, then it may be plotted even
-   //   more times than that.
-   //
-+  if (g_application.ScreenSaverDisablesAutoScrolling())
-+    return false;
-+
-   if (scrollInfo.waitTime)
-   {
-     scrollInfo.waitTime--;
-diff --git a/xbmc/guilib/GUILabel.cpp b/xbmc/guilib/GUILabel.cpp
-index 759ac09..bed6ad2 100644
---- a/xbmc/guilib/GUILabel.cpp
-+++ b/xbmc/guilib/GUILabel.cpp
-@@ -21,6 +21,8 @@
- #include "GUILabel.h"
- #include <limits>
- 
-+#include "Application.h"
-+
- CGUILabel::CGUILabel(float posX, float posY, float width, float height, const CLabelInfo& labelInfo, CGUILabel::OVER_FLOW overflow)
-     : m_label(labelInfo)
-     , m_textLayout(labelInfo.font, overflow == OVER_FLOW_WRAP, height)
-@@ -104,7 +106,7 @@ void CGUILabel::Render()
-   color_t color = GetColor();
-   bool renderSolid = (m_color == COLOR_DISABLED);
-   bool overFlows = (m_renderRect.Width() + 0.5f < m_textLayout.GetTextWidth()); // 0.5f to deal with floating point rounding issues
--  if (overFlows && m_scrolling && !renderSolid)
-+  if (overFlows && m_scrolling && !renderSolid && !g_application.ScreenSaverDisablesAutoScrolling())
-     m_textLayout.RenderScrolling(m_renderRect.x1, m_renderRect.y1, m_label.angle, color, m_label.shadowColor, 0, m_renderRect.Width(), m_scrollInfo);
-   else
-   {
-diff --git a/xbmc/guilib/GUITextBox.cpp b/xbmc/guilib/GUITextBox.cpp
-index d7bc1c5..ac76629 100644
---- a/xbmc/guilib/GUITextBox.cpp
-+++ b/xbmc/guilib/GUITextBox.cpp
-@@ -24,6 +24,7 @@
- #include "utils/MathUtils.h"
- #include "utils/StringUtils.h"
- #include "guiinfo/GUIInfoLabels.h"
-+#include "Application.h"
- 
- #include <algorithm>
- 
-@@ -133,7 +134,7 @@ void CGUITextBox::Process(unsigned int currentTime, CDirtyRegionList &dirtyregio
-   // update our auto-scrolling as necessary
-   if (m_autoScrollTime && m_lines.size() > m_itemsPerPage)
-   {
--    if (!m_autoScrollCondition || m_autoScrollCondition->Get())
-+    if ((!m_autoScrollCondition || m_autoScrollCondition->Get()) && !g_application.ScreenSaverDisablesAutoScrolling())
-     {
-       if (m_lastRenderTime)
-         m_autoScrollDelayTime += currentTime - m_lastRenderTime;
-
-From 6b4fbcdd92b654b53fe8aeb5f00a5037117a505f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 41/93] [demuxer] Avoid memcpy on every demuxer packet
-
-Avoids an unnecessary memcpy on every demuxer packet which for
-high bitrate videos can be significant.
----
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 17 ++++++++++++-----
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h   |  3 +++
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp  |  7 ++++++-
- 3 files changed, 21 insertions(+), 6 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-index 88d486b..47c15b9 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-@@ -753,7 +753,7 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-           {
-             if(m_pkt.pkt.stream_index == (int)m_pFormatContext->programs[m_program]->stream_index[i])
-             {
--              pPacket = CDVDDemuxUtils::AllocateDemuxPacket(m_pkt.pkt.size);
-+              pPacket = CDVDDemuxUtils::AllocateDemuxPacket(0);
-               break;
-             }
-           }
-@@ -762,7 +762,7 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-             bReturnEmpty = true;
-         }
-         else
--          pPacket = CDVDDemuxUtils::AllocateDemuxPacket(m_pkt.pkt.size);
-+          pPacket = CDVDDemuxUtils::AllocateDemuxPacket(0);
-       }
-       else
-         bReturnEmpty = true;
-@@ -804,9 +804,13 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-         // copy contents into our own packet
-         pPacket->iSize = m_pkt.pkt.size;
- 
--        // maybe we can avoid a memcpy here by detecting where pkt.destruct is pointing too?
-         if (m_pkt.pkt.data)
--          memcpy(pPacket->pData, m_pkt.pkt.data, pPacket->iSize);
-+        {
-+          pPacket->pData = m_pkt.pkt.data;
-+          // so we can free AVPacket when DemuxPacket is freed
-+          pPacket->pkt = new AVPacket(m_pkt.pkt);
-+        }
-+
- 
-         pPacket->pts = ConvertTimestamp(m_pkt.pkt.pts, stream->time_base.den, stream->time_base.num);
-         pPacket->dts = ConvertTimestamp(m_pkt.pkt.dts, stream->time_base.den, stream->time_base.num);
-@@ -841,7 +845,10 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-         pPacket->iStreamId = m_pkt.pkt.stream_index;
-       }
-       m_pkt.result = -1;
--      av_free_packet(&m_pkt.pkt);
-+      if (pPacket && pPacket->pkt)
-+        memset(&m_pkt.pkt, 0, sizeof(AVPacket));
-+      else
-+        av_free_packet(&m_pkt.pkt);
-     }
-   }
-   } // end of lock scope
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-index d64fbb3..012a7d1 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-@@ -23,6 +23,8 @@
- #define DMX_SPECIALID_STREAMINFO    -10
- #define DMX_SPECIALID_STREAMCHANGE  -11
- 
-+struct AVPacket;
-+
-  typedef struct DemuxPacket
- {
-   unsigned char* pData;   // data
-@@ -33,4 +35,5 @@
-   double pts; // pts in DVD_TIME_BASE
-   double dts; // dts in DVD_TIME_BASE
-   double duration; // duration in DVD_TIME_BASE if available
-+  AVPacket *pkt; // to allow packet to be freed
- } DemuxPacket;
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-index ab298b2..10c5ee0 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-@@ -34,7 +34,12 @@ void CDVDDemuxUtils::FreeDemuxPacket(DemuxPacket* pPacket)
-   if (pPacket)
-   {
-     try {
--      if (pPacket->pData) _aligned_free(pPacket->pData);
-+      if (pPacket->pkt)
-+      {
-+        av_free_packet(pPacket->pkt);
-+        delete pPacket->pkt;
-+      }
-+      else if (pPacket->pData) _aligned_free(pPacket->pData);
-       delete pPacket;
-     }
-     catch(...) {
-
-From 4e92f88d301118106a6aa08375bdd524fbbb0da8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 15 Feb 2015 14:06:12 +0000
-Subject: [PATCH 42/93] [mmal] Allow mmal codec for dvd stills
-
----
- xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-index 84e9ef1..f920f49 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-@@ -50,6 +50,9 @@
- #include "Video/DVDVideoCodecAndroidMediaCodec.h"
- #include "android/activity/AndroidFeatures.h"
- #endif
-+#if defined(HAS_MMAL)
-+#include "linux/RBP.h"
-+#endif
- #include "Audio/DVDAudioCodecFFmpeg.h"
- #include "Audio/DVDAudioCodecPassthrough.h"
- #include "Overlay/DVDOverlayCodecSSA.h"
-@@ -201,6 +204,10 @@ CDVDVideoCodec* CDVDFactoryCodec::CreateVideoCodec(CDVDStreamInfo &hint, const C
- #endif
-   CLog::Log(LOGDEBUG, "CDVDFactoryCodec: compiled in hardware support: %s", hwSupport.c_str());
- 
-+#if defined(HAS_MMAL)
-+  // mmal can handle dvd playback including stills
-+  if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || !g_RBP.GetCodecMpg2())
-+#endif
-   if (hint.stills && (hint.codec == AV_CODEC_ID_MPEG2VIDEO || hint.codec == AV_CODEC_ID_MPEG1VIDEO))
-   {
-      // If dvd is an mpeg2 and hint.stills
-
-From 6f7b1c2fa7e8b46895b2287b3a9361b85af7b210 Mon Sep 17 00:00:00 2001
-From: anaconda <anaconda@menakite.eu>
-Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 43/93] Load OSD dialogs on startup.
-
-Fixes skipped frames the first time they're loaded in memory on less powered
-devices, like a Raspberry Pi, when using DVDPlayer.
-See http://forum.kodi.tv/showthread.php?tid=211501&pid=1938811#pid1938811
----
- xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp          | 1 +
- xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp             | 1 +
- xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp | 4 +++-
- xbmc/video/dialogs/GUIDialogSubtitles.cpp             | 2 +-
- xbmc/video/dialogs/GUIDialogVideoOSD.cpp              | 2 +-
- xbmc/video/dialogs/GUIDialogVideoSettings.cpp         | 4 +++-
- 6 files changed, 10 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp b/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-index d7e8ac4..76b8c5a 100644
---- a/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-+++ b/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-@@ -50,6 +50,7 @@ CGUIDialogPVRChannelsOSD::CGUIDialogPVRChannelsOSD() :
-     CGUIDialog(WINDOW_DIALOG_PVR_OSD_CHANNELS, "DialogPVRChannelsOSD.xml"),
-     Observer()
- {
-+  m_loadType = LOAD_ON_GUI_INIT;
-   m_vecItems = new CFileItemList;
- }
- 
-diff --git a/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp b/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-index 81dbc27..0462310 100644
---- a/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-+++ b/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-@@ -36,6 +36,7 @@ using namespace PVR;
- CGUIDialogPVRGuideOSD::CGUIDialogPVRGuideOSD()
-     : CGUIDialog(WINDOW_DIALOG_PVR_OSD_GUIDE, "DialogPVRGuideOSD.xml")
- {
-+  m_loadType = LOAD_ON_GUI_INIT;
-   m_vecItems = new CFileItemList;
- }
- 
-diff --git a/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp b/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-index 8d57767..60b6461 100644
---- a/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-+++ b/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-@@ -68,7 +68,9 @@ CGUIDialogAudioSubtitleSettings::CGUIDialogAudioSubtitleSettings()
-   : CGUIDialogSettingsManualBase(WINDOW_DIALOG_AUDIO_OSD_SETTINGS, "VideoOSDSettings.xml"),
-     m_passthrough(false),
-     m_dspEnabled(false)
--{ }
-+{
-+  m_loadType = LOAD_ON_GUI_INIT;
-+}
- 
- CGUIDialogAudioSubtitleSettings::~CGUIDialogAudioSubtitleSettings()
- { }
-diff --git a/xbmc/video/dialogs/GUIDialogSubtitles.cpp b/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-index 3db982a..dd35664 100644
---- a/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-+++ b/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-@@ -103,7 +103,7 @@ CGUIDialogSubtitles::CGUIDialogSubtitles(void)
-     , m_pausedOnRun(false)
-     , m_updateSubsList(false)
- {
--  m_loadType = KEEP_IN_MEMORY;
-+  m_loadType  = LOAD_ON_GUI_INIT;
- }
- 
- CGUIDialogSubtitles::~CGUIDialogSubtitles(void)
-diff --git a/xbmc/video/dialogs/GUIDialogVideoOSD.cpp b/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-index c1e99cf..5e3a31b 100644
---- a/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-+++ b/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-@@ -30,7 +30,7 @@ using namespace PVR;
- CGUIDialogVideoOSD::CGUIDialogVideoOSD(void)
-     : CGUIDialog(WINDOW_DIALOG_VIDEO_OSD, "VideoOSD.xml")
- {
--  m_loadType = KEEP_IN_MEMORY;
-+  m_loadType = LOAD_ON_GUI_INIT;
- }
- 
- CGUIDialogVideoOSD::~CGUIDialogVideoOSD(void)
-diff --git a/xbmc/video/dialogs/GUIDialogVideoSettings.cpp b/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-index b248566..96c63cd 100644
---- a/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-+++ b/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-@@ -66,7 +66,9 @@
- CGUIDialogVideoSettings::CGUIDialogVideoSettings()
-     : CGUIDialogSettingsManualBase(WINDOW_DIALOG_VIDEO_OSD_SETTINGS, "VideoOSDSettings.xml"),
-       m_viewModeChanged(false)
--{ }
-+{
-+  m_loadType = LOAD_ON_GUI_INIT;
-+}
- 
- CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
- { }
-
-From 881432f2448626f24ea06cf02a29c811b075cdc8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 7 Mar 2015 22:46:21 +0000
-Subject: [PATCH 44/93] configure: Add raspberry-pi2 platform
-
----
- configure.ac                                      | 14 +++++++--
- m4/xbmc_arch.m4                                   |  8 ++---
- tools/depends/Makefile.include.in                 |  2 +-
- tools/depends/configure.ac                        | 38 ++++++++++++++++-------
- tools/depends/target/Toolchain.cmake.in           |  2 +-
- tools/depends/target/Toolchain_binaddons.cmake.in |  2 +-
- 6 files changed, 44 insertions(+), 22 deletions(-)
-
-diff --git a/configure.ac b/configure.ac
-index 55e73b9..7a06a31 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -698,8 +698,17 @@ case $use_platform in
-   raspberry-pi)
-      target_platform=target_raspberry_pi
-      use_neon=no
--     use_arch="arm"
-      use_cpu=arm1176jzf-s
-+     ;;
-+  raspberry-pi2)
-+     target_platform=target_raspberry_pi
-+     use_neon=yes
-+     use_cpu=cortex-a7
-+     ;;
-+esac
-+
-+if test "$target_platform" = "target_raspberry_pi" ; then
-+     use_arch="arm"
-      use_hardcoded_tables="yes"
-      use_openmax=no
-      ARCH="arm"
-@@ -708,8 +717,7 @@ case $use_platform in
-      USE_MMAL=1; AC_DEFINE([HAS_MMAL],[1],["Define to 1 if MMAL libs is enabled"])
-      CFLAGS="$CFLAGS"
-      CXXFLAGS="$CXXFLAGS"
--     ;;
--esac
-+fi
- 
- if test "$host_vendor" = "apple"; then
-   use_avahi=no
-diff --git a/m4/xbmc_arch.m4 b/m4/xbmc_arch.m4
-index 0b66a82..adb8e97 100644
---- a/m4/xbmc_arch.m4
-+++ b/m4/xbmc_arch.m4
-@@ -77,9 +77,7 @@ if test "$target_platform" = "target_android" ; then
-   AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -DTARGET_ANDROID")
- fi
- 
--case $use_platform in
--  raspberry-pi)
--     AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -D_ARMEL -DTARGET_RASPBERRY_PI")
--     ;;
--esac
-+if test "$target_platform" = "target_raspberry_pi" ; then
-+  AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -D_ARMEL -DTARGET_RASPBERRY_PI")
-+fi
- ])
-diff --git a/tools/depends/Makefile.include.in b/tools/depends/Makefile.include.in
-index 6e37022..326e7b8 100644
---- a/tools/depends/Makefile.include.in
-+++ b/tools/depends/Makefile.include.in
-@@ -20,7 +20,7 @@ NATIVE_OS=@build_os@
- CROSS_COMPILING=@cross_compiling@
- ARCH_DEFINES=@ARCH_DEFINES@
- NATIVE_ARCH_DEFINES=@NATIVE_ARCH_DEFINES@
--TARGET_PLATFORM=@use_platform@
-+TARGET_PLATFORM=@target_platform@
- XCODE_VERSION=@use_xcode@
- AAPT=@AAPT@
- DX=@DX@
-diff --git a/tools/depends/configure.ac b/tools/depends/configure.ac
-index 12935e3..478f5f0 100644
---- a/tools/depends/configure.ac
-+++ b/tools/depends/configure.ac
-@@ -17,7 +17,8 @@ AC_ARG_WITH([toolchain],
- AC_ARG_WITH([platform],
-   [AS_HELP_STRING([--with-platform],
-   [target platform [auto]])],
--  [use_platform=$withval])
-+  [use_platform=$withval],
-+  [target_platform=$withval])
- 
- AC_ARG_WITH([firmware],
-   [AS_HELP_STRING([--with-firmware],
-@@ -302,34 +303,49 @@ case $host in
-     AC_MSG_ERROR(unsupported host ($use_host))
- esac
- 
--if test "$use_platform" = "raspberry-pi"; then
-+case $use_platform in
-+  raspberry-pi)
-+     target_platform=raspberry_pi
-+     use_neon=no
-+     use_cpu=arm1176jzf-s
-+     platform_cflags="-mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp"
-+     platform_cxxflags="-mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp"
-+     platform_ldflags=""
-+     ;;
-+  raspberry-pi2)
-+     target_platform=raspberry_pi
-+     use_neon=yes
-+     use_cpu=cortex-a7
-+     platform_cflags="-fPIC -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon-vfpv4"
-+     platform_cxxflags="-fPIC -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon-vfpv4"
-+     platform_ldflags="-lpthread"
-+     ;;
-+esac
-+
-+if test "$target_platform" = "raspberry_pi" ; then
-   if test -d "${use_firmware}/opt/vc/include"; then
-     :
-   else
-     AC_MSG_ERROR([Raspberry Pi firmware not found])
-   fi
--  use_neon=no
-   use_arch="arm"
--  use_cpu="arm1176jzf-s"
-   use_hardcoded_tables="yes"
--  use_alsa="no"
-   ARCH="arm"
-   platform_os="linux"
-   cross_compiling="yes"
-   use_host="arm-linux-gnueabihf"
-   deps_dir="$use_host"
--  platform_cflags="-pipe -mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard \
--   -mfpu=vfp -mabi=aapcs-linux -Wno-psabi -Wa,-mno-warn-deprecated \
--   -Wno-deprecated-declarations -isystem${use_firmware}/opt/vc/include \
-+  platform_cflags+=" -pipe -mabi=aapcs-linux -Wno-psabi \
-+   -Wa,-mno-warn-deprecated -Wno-deprecated-declarations \
-+   -isystem${use_firmware}/opt/vc/include \
-    -isystem${use_firmware}/opt/vc/include/interface/vcos/pthreads \
-    -isystem${use_firmware}/opt/vc/include/interface/vmcs_host/linux"
--  platform_cxxflags="-pipe -mcpu=arm1176jzf-s -mtune=arm1176jzf-s \
--   -mfloat-abi=hard -mfpu=vfp -mabi=aapcs-linux -Wno-psabi \
-+  platform_cxxflags+=" -pipe -mabi=aapcs-linux -Wno-psabi \
-    -Wa,-mno-warn-deprecated -Wno-deprecated-declarations \
-    -isystem${use_firmware}/opt/vc/include \
-    -isystem${use_firmware}/opt/vc/include/interface/vcos/pthreads \
-    -isystem${use_firmware}/opt/vc/include/interface/vmcs_host/linux"
--  platform_ldflags="-L${use_firmware}/opt/vc/lib -lEGL -lGLESv2 -lbcm_host -lvcos \
-+  platform_ldflags+=" -L${use_firmware}/opt/vc/lib -lEGL -lGLESv2 -lbcm_host -lvcos \
-    -lvchiq_arm"
- fi
- 
-diff --git a/tools/depends/target/Toolchain.cmake.in b/tools/depends/target/Toolchain.cmake.in
-index 943be73..59385e8 100644
---- a/tools/depends/target/Toolchain.cmake.in
-+++ b/tools/depends/target/Toolchain.cmake.in
-@@ -1,6 +1,6 @@
- SET(OS "@platform_os@")
- SET(CPU "@use_cpu@")
--SET(PLATFORM "@use_platform@")
-+SET(PLATFORM "@target_platform@")
- IF("${OS}" STREQUAL "linux" OR "${OS}" STREQUAL "android")
- SET(CMAKE_SYSTEM_NAME Linux)
- ENDIF()
-diff --git a/tools/depends/target/Toolchain_binaddons.cmake.in b/tools/depends/target/Toolchain_binaddons.cmake.in
-index dc6d565..98494b4 100644
---- a/tools/depends/target/Toolchain_binaddons.cmake.in
-+++ b/tools/depends/target/Toolchain_binaddons.cmake.in
-@@ -1,7 +1,7 @@
- set(CMAKE_SYSTEM_VERSION 1)
- set(OS "@platform_os@")
- set(CPU "@use_cpu@")
--set(PLATFORM "@use_platform@")
-+set(PLATFORM "@target_platform@")
- if("${OS}" STREQUAL "linux" OR "${OS}" STREQUAL "android")
-   set(CMAKE_SYSTEM_NAME Linux)
- endif()
-
-From 555c3d2ed48c00e6ef8632d47db58cab4d53b78b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 45/93] [gui] Also limit GUI updates when in non full-screen
- video mode
-
----
- xbmc/Application.cpp | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index c274e2f..212a5c7 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -2834,7 +2834,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
- #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU)
-     // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode
-     // it makes only sense on architectures with multiple layers
--    if (g_graphicsContext.IsFullScreenVideo() && !m_pPlayer->IsPausedPlayback() && g_renderManager.IsVideoLayer())
-+    if (m_pPlayer->IsPlayingVideo() && !m_pPlayer->IsPausedPlayback() && g_renderManager.IsVideoLayer())
-       fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE);
- #endif
- 
-@@ -2847,6 +2847,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
-     {
-       if (!m_skipGuiRender)
-         g_windowManager.Process(CTimeUtils::GetFrameTime());
-+      else if (!g_graphicsContext.IsFullScreenVideo())
-+        g_renderManager.FrameMove();
-     }
-     g_windowManager.FrameMove();
-   }
-
-From 67b90947ab8fb7fe16d39597f285a7e08fabc5b8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 21 Apr 2015 14:32:07 +0100
-Subject: [PATCH 46/93] [mmalrenderer] Add sharpness control
-
----
- addons/resource.language.en_gb/resources/strings.po |  2 +-
- xbmc/cores/VideoRenderers/MMALRenderer.cpp          | 13 ++++++++++++-
- xbmc/cores/VideoRenderers/MMALRenderer.h            |  1 +
- 3 files changed, 14 insertions(+), 2 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 32314b2..a1da64b 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -8229,7 +8229,7 @@ msgstr ""
- 
- #: xbmc/video/dialogs/GUIDialogVideoSettings.cpp
- msgctxt "#16313"
--msgid "VDPAU - Sharpness"
-+msgid "Sharpness"
- msgstr ""
- 
- #: xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 2dff194..adf6f73 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -252,6 +252,7 @@ CMMALRenderer::CMMALRenderer()
-   m_bMMALConfigured = false;
-   m_iYV12RenderBuffer = 0;
-   m_inflight = 0;
-+  m_sharpness = -2.0f;
- }
- 
- CMMALRenderer::~CMMALRenderer()
-@@ -459,6 +460,15 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-   if (m_RenderUpdateCallBackFn)
-     (*m_RenderUpdateCallBackFn)(m_RenderUpdateCallBackCtx, m_sourceRect, m_destRect);
- 
-+  // if sharpness setting has changed, we should update it
-+  if (m_sharpness != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_Sharpness)
-+  {
-+    m_sharpness = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_Sharpness;
-+    char command[80], response[80];
-+    sprintf(command, "scaling_sharpness %d", ((int)(50.0f * (m_sharpness + 1.0f) + 0.5f)));
-+    vc_gencmd(response, sizeof response, command);
-+  }
-+
-   if (m_format == RENDER_FMT_BYPASS)
-   {
- #if defined(MMAL_DEBUG_VERBOSE)
-@@ -669,7 +679,8 @@ bool CMMALRenderer::Supports(ERENDERFEATURE feature)
-       feature == RENDERFEATURE_ZOOM            ||
-       feature == RENDERFEATURE_ROTATION        ||
-       feature == RENDERFEATURE_VERTICAL_SHIFT  ||
--      feature == RENDERFEATURE_PIXEL_RATIO)
-+      feature == RENDERFEATURE_PIXEL_RATIO     ||
-+      feature == RENDERFEATURE_SHARPNESS)
-     return true;
- 
-   return false;
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.h b/xbmc/cores/VideoRenderers/MMALRenderer.h
-index d3e5129..a71e645 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.h
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.h
-@@ -119,6 +119,7 @@ protected:
-   RENDER_STEREO_MODE        m_display_stereo_mode;
-   bool                      m_StereoInvert;
-   int                       m_inflight;
-+  float                     m_sharpness;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_vout;
-
-From 2ce900e2ef03fae1215700b5a839276585a00c92 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 24 Apr 2015 13:49:51 +0100
-Subject: [PATCH 47/93] [dvdplayer] Add back required include
-
----
- xbmc/cores/dvdplayer/DVDPlayerVideo.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-index b5777a1..64b4d60 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-@@ -33,6 +33,7 @@
- #include "DVDCodecs/Video/DVDVideoPPFFmpeg.h"
- #include "DVDCodecs/Video/DVDVideoCodecFFmpeg.h"
- #include "DVDDemuxers/DVDDemux.h"
-+#include "DVDDemuxers/DVDDemuxUtils.h"
- #include "DVDOverlayRenderer.h"
- #include "guilib/GraphicContext.h"
- #include <sstream>
-
-From dc5e83b0cfbec04a34b3b8ea7fca8bbbcaae1f2c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 48/93] [screensaver] Leave GUI contents available for
- screensaver
-
----
- xbmc/guilib/GUIWindowManager.cpp | 11 ++++++++++-
- 1 file changed, 10 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/guilib/GUIWindowManager.cpp b/xbmc/guilib/GUIWindowManager.cpp
-index 89cfb8b..767c7b6 100644
---- a/xbmc/guilib/GUIWindowManager.cpp
-+++ b/xbmc/guilib/GUIWindowManager.cpp
-@@ -786,7 +786,16 @@ void CGUIWindowManager::ActivateWindow_Internal(int iWindowID, const std::vector
-   int currentWindow = GetActiveWindow();
-   CGUIWindow *pWindow = GetWindow(currentWindow);
-   if (pWindow)
--    CloseWindowSync(pWindow, iWindowID);
-+  {
-+    if (iWindowID == WINDOW_SCREENSAVER)
-+    {
-+      pWindow->Close(true, iWindowID);
-+    }
-+    else
-+    {
-+      CloseWindowSync(pWindow, iWindowID);
-+    }
-+  }
-   g_infoManager.SetNextWindow(WINDOW_INVALID);
- 
-   // Add window to the history list (we must do this before we activate it,
-
-From 249d8d5147b3124129255deaa216da316cb8732e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 49/93] ffmpeg: Automatic switch to software decode for GMC
- with more than one warp point
-
----
- ...Signal-unsupported-GMC-with-more-than-one.patch | 48 ++++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |  4 +-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp |  6 +++
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h        |  2 +
- .../cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp |  2 +-
- xbmc/cores/dvdplayer/DVDStreamInfo.cpp             |  3 ++
- xbmc/cores/dvdplayer/DVDStreamInfo.h               |  1 +
- xbmc/cores/omxplayer/OMXHelper.cpp                 |  8 +++-
- 8 files changed, 71 insertions(+), 3 deletions(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-new file mode 100644
-index 0000000..4cb8dd8
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-@@ -0,0 +1,48 @@
-+From 84e9a1784bbd3182b68cefa5e5feae8da8b9e184 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 5 Jun 2015 22:48:33 +0100
-+Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
-+ point
-+
-+---
-+ libavcodec/avcodec.h       | 1 +
-+ libavcodec/mpeg4videodec.c | 4 ++++
-+ 2 files changed, 5 insertions(+)
-+
-+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-+index 8c7c420..e63dc2d 100644
-+--- a/libavcodec/avcodec.h
-++++ b/libavcodec/avcodec.h
-+@@ -2527,6 +2527,7 @@ typedef struct AVCodecContext {
-+ #define FF_BUG_DC_CLIP          4096
-+ #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
-+ #define FF_BUG_TRUNCATED       16384
-++#define FF_BUG_GMC_UNSUPPORTED 32768
-+ 
-+     /**
-+      * strictly follow the standard (MPEG4, ...).
-+diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-+index 9bf33dd..0b5d3b9 100644
-+--- a/libavcodec/mpeg4videodec.c
-++++ b/libavcodec/mpeg4videodec.c
-+@@ -2179,6 +2179,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-+ 
-+         if (ctx->divx_version >= 0)
-+             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
-++
-++        if (ctx->num_sprite_warping_points > 1)
-++            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
-+     }
-+ 
-+     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-+@@ -2203,6 +2206,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-+                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-+                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-+ 
-++    avctx->workaround_bugs = s->workaround_bugs;
-+     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
-+         s->codec_id == AV_CODEC_ID_MPEG4 &&
-+         avctx->idct_algo == FF_IDCT_AUTO) {
-+-- 
-+1.9.1
-+
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index fcfc553..6a9f105 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,6 +1,7 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
--DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch
-+DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-+  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -71,6 +72,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index c09074d..3345685 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -46,6 +46,10 @@
- 
- #include "linux/RBP.h"
- 
-+#ifndef FF_BUG_GMC_UNSUPPORTED
-+#define FF_BUG_GMC_UNSUPPORTED 0
-+#endif
-+
- using namespace KODI::MESSAGING;
- 
- #define CLASSNAME "CMMALVideoBuffer"
-@@ -531,6 +535,8 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   // we always qualify even if DVDFactoryCodec does this too.
-   if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
-     return false;
-+  if (hints.workaround_bugs & FF_BUG_GMC_UNSUPPORTED)
-+    return false;
- 
-   m_hints = hints;
-   m_vout_input_pool = (MMAL_POOL_T *)options.m_opaque_pointer;
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-index e380056..122e539 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-@@ -160,6 +160,7 @@ public:
-     type = STREAM_VIDEO;
-     iOrientation = 0;
-     iBitsPerPixel = 0;
-+    workaround_bugs = 0;
-   }
- 
-   virtual ~CDemuxStreamVideo() {}
-@@ -176,6 +177,7 @@ public:
-   int iOrientation; // orientation of the video in degress counter clockwise
-   int iBitsPerPixel;
-   std::string stereo_mode; // expected stereo mode
-+  int workaround_bugs; // info for decoder
- };
- 
- class CDemuxStreamAudio : public CDemuxStream
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-index 47c15b9..56dcbfb 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-@@ -1215,7 +1215,7 @@ CDemuxStream* CDVDDemuxFFmpeg::AddStream(int iId)
-         if (!stereoMode.empty())
-           st->stereo_mode = stereoMode;
- 
--        
-+        st->workaround_bugs = pStream->codec->workaround_bugs;
-         if ( m_pInput->IsStreamType(DVDSTREAM_TYPE_DVD) )
-         {
-           if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-index fa0defa..37c2d16 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-@@ -73,6 +73,7 @@ void CDVDStreamInfo::Clear()
-   bitspersample = 0;
- 
-   orientation = 0;
-+  workaround_bugs = 0;
- }
- 
- bool CDVDStreamInfo::Equal(const CDVDStreamInfo& right, bool withextradata)
-@@ -174,6 +175,7 @@ void CDVDStreamInfo::Assign(const CDVDStreamInfo& right, bool withextradata)
-   vfr = right.vfr;
-   software = right.software;
-   stereo_mode = right.stereo_mode;
-+  workaround_bugs = right.workaround_bugs;
- 
-   // AUDIO
-   channels      = right.channels;
-@@ -231,6 +233,7 @@ void CDVDStreamInfo::Assign(const CDemuxStream& right, bool withextradata)
-     bitsperpixel = stream->iBitsPerPixel;
-     pid = stream->iPhysicalId;
-     stereo_mode = stream->stereo_mode;
-+    workaround_bugs = stream->workaround_bugs;
-   }
-   else if(  right.type == STREAM_SUBTITLE )
-   {
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.h b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-index c0e22a2..3849993 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.h
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-@@ -73,6 +73,7 @@ public:
-   int bitsperpixel;
-   int pid;
-   std::string stereo_mode; // stereoscopic 3d mode
-+  int workaround_bugs; // info for decoder
- 
-   // AUDIO
-   int channels;
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index 7251fc1..3429cea 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -29,6 +29,10 @@
- #include "cores/omxplayer/OMXPlayerAudio.h"
- #include "cores/omxplayer/OMXPlayerVideo.h"
- 
-+#ifndef FF_BUG_GMC_UNSUPPORTED
-+#define FF_BUG_GMC_UNSUPPORTED 0
-+#endif
-+
- #define PREDICATE_RETURN(lh, rh) \
-   do { \
-     if((lh) != (rh)) \
-@@ -80,7 +84,9 @@ bool OMXPlayerUnsuitable(bool m_HasVideo, bool m_HasAudio, CDVDDemux* m_pDemuxer
-       CDVDStreamInfo hint(*stream, true);
- 
-       bool supported = false;
--      if ((hint.codec == AV_CODEC_ID_MPEG1VIDEO || hint.codec == AV_CODEC_ID_MPEG2VIDEO) && g_RBP.GetCodecMpg2())
-+      if (hint.workaround_bugs & FF_BUG_GMC_UNSUPPORTED)
-+        ;
-+      else if ((hint.codec == AV_CODEC_ID_MPEG1VIDEO || hint.codec == AV_CODEC_ID_MPEG2VIDEO) && g_RBP.GetCodecMpg2())
-         supported = true;
-       else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
-         supported = true;
-
-From eb80abbf4ad8994a28d58ea8494e8a7bcd48b2f3 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 50/93] [rbp] Use default resampling setting on Pi2
-
----
- system/settings/rbp2.xml | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/system/settings/rbp2.xml b/system/settings/rbp2.xml
-index 8cc8f19..52778ec 100644
---- a/system/settings/rbp2.xml
-+++ b/system/settings/rbp2.xml
-@@ -23,6 +23,11 @@
-         <setting id="audiooutput.ac3transcode" help="36429">
-         </setting>
-       </group>
-+      <group id="1">
-+        <setting id="audiooutput.processquality">
-+          <default>30</default> <!-- AE_QUALITY_MID -->
-+        </setting>
-+      </group>
-     </category>
-   </section>
- </settings>
-
-From 822ce9d64325082d7b071b68331c8fbd406d2ee1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 10 Mar 2016 17:54:59 +0000
-Subject: [PATCH 51/93] rbp: Expose gpu memory allocation functions
-
----
- xbmc/linux/RBP.cpp         | 116 ++++++++++++
- xbmc/linux/RBP.h           |  16 ++
- xbmc/linux/rpi_user_vcsm.h | 460 +++++++++++++++++++++++++++++++++++++++++++++
- 3 files changed, 592 insertions(+)
- create mode 100644 xbmc/linux/rpi_user_vcsm.h
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index a79d6d9..257c238 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -28,6 +28,17 @@
- 
- #include "cores/omxplayer/OMXImage.h"
- 
-+#include <sys/ioctl.h>
-+#include <linux/ioctl.h>
-+#include "rpi_user_vcsm.h"
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+static int mbox_open();
-+static void mbox_close(int file_desc);
-+
- CRBP::CRBP()
- {
-   m_initialized     = false;
-@@ -36,6 +47,8 @@ CRBP::CRBP()
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-   m_last_pll_adjust = 1.0;
-+  m_mb = mbox_open();
-+  vcsm_init();
- }
- 
- CRBP::~CRBP()
-@@ -225,6 +238,10 @@ void CRBP::Deinitialize()
-   m_omx_image_init  = false;
-   m_initialized     = false;
-   m_omx_initialized = false;
-+  if (m_mb)
-+    mbox_close(m_mb);
-+  m_mb = 0;
-+  vcsm_exit();
- }
- 
- double CRBP::AdjustHDMIClock(double adjust)
-@@ -238,4 +255,103 @@ double CRBP::AdjustHDMIClock(double adjust)
-   return m_last_pll_adjust;
- }
- 
-+static int mbox_property(int file_desc, void *buf)
-+{
-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+   if (ret_val < 0)
-+   {
-+     CLog::Log(LOGERROR, "%s: ioctl_set_msg failed:%d", __FUNCTION__, ret_val);
-+   }
-+   return ret_val;
-+}
-+
-+static int mbox_open()
-+{
-+   int file_desc;
-+
-+   // open a char device file used for communicating with kernel mbox driver
-+   file_desc = open(DEVICE_FILE_NAME, 0);
-+   if (file_desc < 0)
-+   {
-+     CLog::Log(LOGERROR, "%s: Can't open device file: %s (%d)", __FUNCTION__, DEVICE_FILE_NAME, file_desc);
-+     CLog::Log(LOGERROR, "Try creating a device file with: sudo mknod %s c %d 0", __FUNCTION__, DEVICE_FILE_NAME, MAJOR_NUM);
-+   }
-+   return file_desc;
-+}
-+
-+static void mbox_close(int file_desc)
-+{
-+  close(file_desc);
-+}
-+
-+static unsigned mem_lock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000d; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_unlock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000e; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
-+{
-+  m_numbytes = numbytes;
-+  m_vcsm_handle = vcsm_malloc_cache(numbytes, cached ? VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE, (char *)"CGPUMEM");
-+  assert(m_vcsm_handle);
-+  m_vc_handle = vcsm_vc_hdl_from_hdl(m_vcsm_handle);
-+  assert(m_vc_handle);
-+  m_arm = vcsm_lock(m_vcsm_handle);
-+  assert(m_arm);
-+  m_vc = mem_lock(g_RBP.GetMBox(), m_vc_handle);
-+  assert(m_vc);
-+}
-+
-+CGPUMEM::~CGPUMEM()
-+{
-+  mem_unlock(g_RBP.GetMBox(), m_vc_handle);
-+  vcsm_unlock_ptr(m_arm);
-+  vcsm_free(m_vcsm_handle);
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void CGPUMEM::Flush()
-+{
-+  struct vcsm_user_clean_invalid_s iocache = {};
-+  iocache.s[0].handle = m_vcsm_handle;
-+  iocache.s[0].cmd = 3; // clean+invalidate
-+  iocache.s[0].addr = (int) m_arm;
-+  iocache.s[0].size  = m_numbytes;
-+  vcsm_clean_invalid( &iocache );
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index db2fade..ab24bbe 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -41,6 +41,20 @@
- #include "threads/CriticalSection.h"
- #include "threads/Event.h"
- 
-+class CGPUMEM
-+{
-+public:
-+  CGPUMEM(unsigned int numbytes, bool cached = true);
-+  ~CGPUMEM();
-+  void Flush();
-+  void *m_arm; // Pointer to memory mapped on ARM side
-+  int m_vc_handle;   // Videocore handle of relocatable memory
-+  int m_vcsm_handle; // Handle for use by VCSM
-+  unsigned int m_vc;       // Address for use in GPU code
-+  unsigned int m_numbytes; // Size of memory block
-+  void *m_opaque;
-+};
-+
- class CRBP
- {
- public:
-@@ -66,6 +80,7 @@ public:
-   void WaitVsync();
-   double AdjustHDMIClock(double adjust);
-   double GetAdjustHDMIClock() { return m_last_pll_adjust; }
-+  int GetMBox() { return m_mb; }
- 
- private:
-   DllBcmHost *m_DllBcmHost;
-@@ -83,6 +98,7 @@ private:
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-+  int m_mb;
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/linux/rpi_user_vcsm.h b/xbmc/linux/rpi_user_vcsm.h
-new file mode 100644
-index 0000000..94e6e79
---- /dev/null
-+++ b/xbmc/linux/rpi_user_vcsm.h
-@@ -0,0 +1,460 @@
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
-+*    all reasonable efforts to protect the confidentiality thereof, and to
-+*    use this information only in connection with your use of Broadcom
-+*    integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+**       1) Initialize the service.
-+**       2) Allocate shared memory blocks.
-+**       3) Start using the allocated blocks.
-+**          - In order to gain ownership on a block, lock the allocated block,
-+**            locking a block returns a valid address that the user application
-+**            can access.
-+**          - When finished with using the block for the current execution cycle
-+**            or function, and so when giving up the ownership, unlock the block.
-+**       4) A block can be locked/unlocked as many times required - within or outside
-+**          of - a specific execution context.
-+**       5) To completely release an allocated block, free it.
-+**       6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+**   Memory blocks can be allocated in different manners depending on the cache
-+**   behavior desired.  A given block can either be:
-+
-+**       - Allocated in a non cached fashion all the way through host and videocore.
-+**       - Allocated in a cached fashion on host OR videocore.
-+**       - Allocated in a cached fashion on host AND videocore.
-+**
-+**   It is an application decision to determine how to allocate a block.  Evidently
-+**   if the application will be doing substantial read/write accesses to a given block,
-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
-+**   better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, locking the
-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**   It is possible to dynamically change the host cache behavior (ie cached or non
-+**   cached) of a given allocation without needing to free and re-allocate the block.
-+**   This feature can be useful for such application which requires access to the block
-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-+**   the application can optimize performances for a given duration of use.
-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-+**   cache.  If one requires to change the videocore cache behavior, then a new block
-+**   must be created to replace the old one.
-+**
-+**   On successful locking, a valid pointer is returned that the application can use
-+**   to access to data inside the block.  There is no guarantee that the pointer will
-+**   stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, unlocking the
-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-+**   explicitely asked not to flush the cache for performances reasons.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
-+
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
-+
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-+                                    // Result of the walk is seen in the videocore
-+                                    // log.
-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-+                                    // driver (ie for all processes).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+                                    // VCSM_STATUS_HOST_WALK_MAP.
-+                                    //
-+   VCSM_STATUS_NONE,                // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
-+
-+
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
-+
-+
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+** 
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+** 
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed.  If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process.  In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-+
-+
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns:        0 on error
-+**                 a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.  The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired.  The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use.  Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL.  The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+                       VCSM_CACHE_TYPE_T cache_update,
-+                       VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns:        non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate       given virtual range in L1/L2
-+** 2: clean            given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
-+*/
-+struct vcsm_user_clean_invalid_s {
-+   struct {
-+      unsigned int cmd;
-+      unsigned int handle;
-+      unsigned int addr;
-+      unsigned int size;
-+   } s[8];
-+};
-+
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* __USER_VCSM__H__INCLUDED__ */
-+
-
-From 97b436ca545f9a2faad6fdf02a9668843bd3f324 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 10 Mar 2016 17:56:11 +0000
-Subject: [PATCH 52/93] [rbp] HW mouse pointer
-
-Updating the mouse point provokes a complete screen update which can make it feel laggy
-and results in high cpu.
-
-Render the mouse with an overlay to avoid redrawing the normal gui.
----
- xbmc/guilib/GUIWindowManager.cpp  |   2 +
- xbmc/linux/RBP.cpp                | 137 ++++++++++++++++++++++++++++++++++++++
- xbmc/linux/RBP.h                  |   9 +++
- xbmc/windowing/WinEventsLinux.cpp | 125 ++++++++++++++++++++++++++++++++++
- xbmc/windowing/WinEventsLinux.h   |  11 +++
- 5 files changed, 284 insertions(+)
-
-diff --git a/xbmc/guilib/GUIWindowManager.cpp b/xbmc/guilib/GUIWindowManager.cpp
-index 767c7b6..58a196a 100644
---- a/xbmc/guilib/GUIWindowManager.cpp
-+++ b/xbmc/guilib/GUIWindowManager.cpp
-@@ -192,7 +192,9 @@ void CGUIWindowManager::CreateWindows()
-   Add(new CGUIWindowAddonBrowser);
-   Add(new CGUIWindowScreensaverDim);
-   Add(new CGUIWindowDebugInfo);
-+#ifndef TARGET_RASPBERRY_PI
-   Add(new CGUIWindowPointer);
-+#endif
-   Add(new CGUIDialogYesNo);
-   Add(new CGUIDialogProgress);
-   Add(new CGUIDialogExtendedProgressBar);
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 257c238..13b0504 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -28,6 +28,9 @@
- 
- #include "cores/omxplayer/OMXImage.h"
- 
-+#include "guilib/GraphicContext.h"
-+#include "settings/DisplaySettings.h"
-+
- #include <sys/ioctl.h>
- #include <linux/ioctl.h>
- #include "rpi_user_vcsm.h"
-@@ -47,6 +50,10 @@ CRBP::CRBP()
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-   m_last_pll_adjust = 1.0;
-+  m_p = NULL;
-+  m_x = 0;
-+  m_y = 0;
-+  m_enabled = 0;
-   m_mb = mbox_open();
-   vcsm_init();
- }
-@@ -129,7 +136,10 @@ void CRBP::LogFirmwareVerison()
- DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- {
-   if (m_display == DISPMANX_NO_HANDLE)
-+  {
-     m_display = vc_dispmanx_display_open( 0 /*screen*/ );
-+    init_cursor();
-+  }
-   return m_display;
- }
- 
-@@ -138,6 +148,7 @@ void CRBP::CloseDisplay(DISPMANX_DISPLAY_HANDLE_T display)
-   assert(display == m_display);
-   vc_dispmanx_display_close(m_display);
-   m_display = DISPMANX_NO_HANDLE;
-+  uninit_cursor();
- }
- 
- void CRBP::GetDisplaySize(int &width, int &height)
-@@ -238,6 +249,9 @@ void CRBP::Deinitialize()
-   m_omx_image_init  = false;
-   m_initialized     = false;
-   m_omx_initialized = false;
-+  uninit_cursor();
-+  delete m_p;
-+  m_p = NULL;
-   if (m_mb)
-     mbox_close(m_mb);
-   m_mb = 0;
-@@ -323,6 +337,52 @@ unsigned mem_unlock(int file_desc, unsigned handle)
-    return p[5];
- }
- 
-+unsigned int mailbox_set_cursor_info(int file_desc, int width, int height, int format, uint32_t buffer, int hotspotx, int hotspoty)
-+{
-+   int i=0;
-+   unsigned int p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x00008010; // set cursor state
-+   p[i++] = 24; // buffer size
-+   p[i++] = 24; // data size
-+
-+   p[i++] = width;
-+   p[i++] = height;
-+   p[i++] = format;
-+   p[i++] = buffer;           // ptr to VC memory buffer. Doesn't work in 64bit....
-+   p[i++] = hotspotx;
-+   p[i++] = hotspoty;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof(*p); // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+
-+}
-+
-+unsigned int mailbox_set_cursor_position(int file_desc, int enabled, int x, int y)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x00008011; // set cursor state
-+   p[i++] = 12; // buffer size
-+   p[i++] = 12; // data size
-+
-+   p[i++] = enabled;
-+   p[i++] = x;
-+   p[i++] = y;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
- CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
- {
-   m_numbytes = numbytes;
-@@ -354,4 +414,81 @@ void CGPUMEM::Flush()
-   vcsm_clean_invalid( &iocache );
- }
- 
-+#define T 0
-+#define W 0xffffffff
-+#define B 0xff000000
-+
-+const static uint32_t default_cursor_pixels[] =
-+{
-+   B,B,B,B,B,B,B,B,B,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,W,B,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,B,T,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,B,T,T,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,B,T,T,T,T,T,T,T,T,T,
-+   B,W,W,B,W,W,W,B,T,T,T,T,T,T,T,T,
-+   B,W,B,T,B,W,W,W,B,T,T,T,T,T,T,T,
-+   B,B,T,T,T,B,W,W,W,B,T,T,T,T,T,T,
-+   B,T,T,T,T,T,B,W,W,W,B,T,T,T,T,T,
-+   T,T,T,T,T,T,T,B,W,W,W,B,T,T,T,T,
-+   T,T,T,T,T,T,T,T,B,W,W,W,B,T,T,T,
-+   T,T,T,T,T,T,T,T,T,B,W,W,W,B,T,T,
-+   T,T,T,T,T,T,T,T,T,T,B,W,W,W,B,T,
-+   T,T,T,T,T,T,T,T,T,T,T,B,W,W,W,B,
-+   T,T,T,T,T,T,T,T,T,T,T,T,B,W,B,T,
-+   T,T,T,T,T,T,T,T,T,T,T,T,T,B,T,T
-+};
-+
-+#undef T
-+#undef W
-+#undef B
-+
-+void CRBP::init_cursor()
-+{
-+  if (!m_mb)
-+    return;
-+  if (!m_p)
-+    m_p = new CGPUMEM(64 * 64 * 4, false);
-+  if (m_p && m_p->m_arm && m_p->m_vc)
-+    set_cursor(default_cursor_pixels, 16, 16, 0, 0);
-+}
-+
-+void CRBP::set_cursor(const void *pixels, int width, int height, int hotspot_x, int hotspot_y)
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc || !pixels || width * height > 64 * 64)
-+    return;
-+  memcpy(m_p->m_arm, pixels, width * height * 4);
-+  unsigned int s = mailbox_set_cursor_info(m_mb, width, height, 0, m_p->m_vc, hotspot_x, hotspot_y);
-+  assert(s == 0);
-+}
-+
-+void CRBP::update_cursor(int x, int y, bool enabled)
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc)
-+    return;
-+
-+  RESOLUTION res = g_graphicsContext.GetVideoResolution();
-+  CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-+  CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
-+
-+  int x2 = x * display.Width()  / gui.Width();
-+  int y2 = y * display.Height() / gui.Height();
-+
-+  if (g_graphicsContext.GetStereoMode() == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    y2 *= 2;
-+  else if (g_graphicsContext.GetStereoMode() == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    x2 *= 2;
-+  if (m_x != x2 || m_y != y2 || m_enabled != enabled)
-+    mailbox_set_cursor_position(m_mb, enabled, x2, y2);
-+  m_x = x2;
-+  m_y = y2;
-+  m_enabled = enabled;
-+}
-+
-+void CRBP::uninit_cursor()
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc)
-+    return;
-+  mailbox_set_cursor_position(m_mb, 0, 0, 0);
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index ab24bbe..2eee35d 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -99,6 +99,15 @@ private:
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-   int m_mb;
-+  CGPUMEM *m_p;
-+  int m_x;
-+  int m_y;
-+  bool m_enabled;
-+  public:
-+  void init_cursor();
-+  void set_cursor(const void *pixels, int width, int height, int hotspot_x, int hotspot_y);
-+  void update_cursor(int x, int y, bool enabled);
-+  void uninit_cursor();
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/windowing/WinEventsLinux.cpp b/xbmc/windowing/WinEventsLinux.cpp
-index aaba119..2b3d77a 100644
---- a/xbmc/windowing/WinEventsLinux.cpp
-+++ b/xbmc/windowing/WinEventsLinux.cpp
-@@ -30,11 +30,26 @@
- #include "utils/log.h"
- #include "powermanagement/PowerManager.h"
- 
-+#ifdef TARGET_RASPBERRY_PI
-+#include "utils/TimeUtils.h"
-+#include "guilib/Resolution.h"
-+#include "addons/Skin.h"
-+#include "utils/XMLUtils.h"
-+#include "utils/StringUtils.h"
-+#include "guilib/Texture.h"
-+#include "linux/RBP.h"
-+#include "input/InputManager.h"
-+#endif
-+
- bool CWinEventsLinux::m_initialized = false;
- CLinuxInputDevices CWinEventsLinux::m_devices;
- 
- CWinEventsLinux::CWinEventsLinux()
- {
-+#ifdef TARGET_RASPBERRY_PI
-+  m_last_mouse_move_time = 0;
-+  m_mouse_state = -1;
-+#endif
- }
- 
- void CWinEventsLinux::RefreshDevices()
-@@ -48,19 +63,122 @@ bool CWinEventsLinux::IsRemoteLowBattery()
-   return false;
- }
- 
-+#ifdef TARGET_RASPBERRY_PI
-+bool CWinEventsLinux::LoadXML(const std::string strFileName)
-+{
-+  RESOLUTION_INFO m_coordsRes; // resolution that the window coordinates are in.
-+  // Find appropriate skin folder + resolution to load from
-+  std::string strFileNameLower = strFileName;
-+  StringUtils::ToLower(strFileNameLower);
-+  std::string strLowerPath = g_SkinInfo->GetSkinPath(strFileNameLower, &m_coordsRes);
-+  std::string strPath = g_SkinInfo->GetSkinPath(strFileName, &m_coordsRes);
-+
-+  TiXmlElement* pRootElement = NULL;
-+  CXBMCTinyXML xmlDoc;
-+  std::string strPathLower = strPath;
-+  StringUtils::ToLower(strPathLower);
-+  if (!xmlDoc.LoadFile(strPath) && !xmlDoc.LoadFile(strPathLower) && !xmlDoc.LoadFile(strLowerPath))
-+  {
-+    CLog::Log(LOGERROR, "unable to load:%s, Line %d\n%s", strPath.c_str(), xmlDoc.ErrorRow(), xmlDoc.ErrorDesc());
-+    return false;
-+  }
-+  pRootElement = (TiXmlElement*)xmlDoc.RootElement()->Clone();
-+
-+  if (!pRootElement)
-+    return false;
-+
-+  if (strcmpi(pRootElement->Value(), "window"))
-+  {
-+    CLog::Log(LOGERROR, "file : XML file doesnt contain <window>");
-+    return false;
-+  }
-+
-+  TiXmlElement *pChild = pRootElement->FirstChildElement();
-+  while (pChild)
-+  {
-+    if (strcmpi(pChild->Value(), "controls") == 0)
-+    {
-+      TiXmlElement *pControl = pChild->FirstChildElement();
-+      while (pControl)
-+      {
-+        if (strcmpi(pControl->Value(), "control") == 0)
-+        {
-+          std::string strStringValue;
-+          if (XMLUtils::GetString(pControl, "texture", strStringValue))
-+          {
-+            const char* idAttr = pControl->Attribute("id");
-+            int index = idAttr ? atoi(idAttr)-1 : -1;
-+            if (index >= 0 && index < (int)(sizeof m_cursors/sizeof *m_cursors))
-+            {
-+              if (m_cursors[index].m_filename.size())
-+                g_TextureManager.ReleaseTexture(m_cursors[index].m_filename, true);
-+              m_cursors[index].m_filename.clear();
-+              m_cursors[index].m_texture = g_TextureManager.Load(strStringValue);
-+              if (m_cursors[index].m_texture.size())
-+                m_cursors[index].m_filename = strStringValue;
-+            }
-+          }
-+        }
-+        pControl = pControl->NextSiblingElement();
-+      }
-+    }
-+    pChild = pChild->NextSiblingElement();
-+  }
-+  delete pRootElement;
-+  return true;
-+}
-+#endif
-+
- bool CWinEventsLinux::MessagePump()
- {
-   if (!m_initialized)
-   {
-     m_devices.InitAvailable();
-     m_initialized = true;
-+#ifdef TARGET_RASPBERRY_PI
-+    LoadXML("Pointer.xml");
-+#endif
-   }
- 
-   bool ret = false;
-   XBMC_Event event = {0};
-+#ifdef TARGET_RASPBERRY_PI
-+  bool active = CInputManager::GetInstance().IsMouseActive();
-+  int64_t Now = CurrentHostCounter();
-+  if (!active)
-+  {
-+    if (m_mouse_state != -1)
-+    {
-+      g_RBP.update_cursor(0, 0, 0);
-+      m_mouse_state = -1;
-+    }
-+  }
-+  else
-+  {
-+    int state = CInputManager::GetInstance().GetMouseState() - 1;
-+    if (m_mouse_state != state)
-+    {
-+      if (state >= 0 && state < (int)(sizeof m_cursors/sizeof *m_cursors))
-+      {
-+        CBaseTexture *t = (m_cursors[state].m_texture.m_textures)[0];
-+        if (t)
-+          g_RBP.set_cursor((const void *)t->GetPixels(), t->GetPitch()>>2, t->GetRows(), 0, 0);
-+      }
-+      m_mouse_state = state;
-+    }
-+  }
-+#endif
-   while (1)
-   {
-     event = m_devices.ReadEvent();
-+#ifdef TARGET_RASPBERRY_PI
-+    if (active && (event.type == XBMC_MOUSEMOTION || event.type == XBMC_MOUSEBUTTONDOWN || event.type == XBMC_MOUSEBUTTONUP))
-+    {
-+      if (event.type == XBMC_MOUSEMOTION)
-+        g_RBP.update_cursor(event.motion.x, event.motion.y, 1);
-+      m_last_mouse_move_time = Now;
-+    }
-+#endif
-     if (event.type != XBMC_NOEVENT)
-     {
-       ret |= g_application.OnEvent(event);
-@@ -71,6 +189,13 @@ bool CWinEventsLinux::MessagePump()
-     }
-   }
- 
-+#ifdef TARGET_RASPBERRY_PI
-+  if (active && Now - m_last_mouse_move_time > 5 * 1000000000LL)
-+  {
-+    g_RBP.update_cursor(0, 0, 0);
-+    m_mouse_state = -1;
-+  }
-+#endif
-   return ret;
- }
- 
-diff --git a/xbmc/windowing/WinEventsLinux.h b/xbmc/windowing/WinEventsLinux.h
-index a17e987..23244a2 100644
---- a/xbmc/windowing/WinEventsLinux.h
-+++ b/xbmc/windowing/WinEventsLinux.h
-@@ -24,6 +24,7 @@
- #pragma once
- #include "windowing/WinEvents.h"
- #include "input/linux/LinuxInputDevices.h"
-+#include "guilib/TextureManager.h"
- 
- class CWinEventsLinux : public IWinEvents
- {
-@@ -43,6 +44,16 @@ public:
- private:
-   static bool m_initialized;
-   static CLinuxInputDevices m_devices;
-+#ifdef TARGET_RASPBERRY_PI
-+  bool LoadXML(const std::string strFileName);
-+  int64_t m_last_mouse_move_time;
-+  struct
-+  {
-+    std::string m_filename;
-+    CTextureArray m_texture;
-+  } m_cursors[4];
-+  int m_mouse_state;
-+#endif
- };
- 
- #endif
-
-From fbd04377a1dac080166e1e4baa2250f402e3b66f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 2 Aug 2014 17:48:04 +0100
-Subject: [PATCH 53/93] [omx] Report decoded image name
-
----
- xbmc/cores/omxplayer/OMXImage.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/xbmc/cores/omxplayer/OMXImage.cpp b/xbmc/cores/omxplayer/OMXImage.cpp
-index 1524214..a01c435 100644
---- a/xbmc/cores/omxplayer/OMXImage.cpp
-+++ b/xbmc/cores/omxplayer/OMXImage.cpp
-@@ -327,6 +327,7 @@ bool COMXImage::DecodeJpegToTexture(COMXImageFile *file, unsigned int width, uns
-   {
-     ret = true;
-     *userdata = tex;
-+    CLog::Log(LOGDEBUG, "%s: decoded %s %dx%d", __func__, file->GetFilename(), width, height);
-   }
-   else
-   {
-
-From 99d06dd14a4501fe81b36e8ce3966dc99cd04b94 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 11 Apr 2014 16:12:27 +0100
-Subject: [PATCH 54/93] [omxplayer] Add ability to log more timestamp info in
- extra debug settings
-
----
- addons/resource.language.en_gb/resources/strings.po |  5 +++++
- xbmc/commons/ilog.h                                 |  1 +
- xbmc/cores/omxplayer/OMXHelper.cpp                  | 12 +++++++-----
- xbmc/cores/omxplayer/OMXPlayerAudio.cpp             |  8 ++++----
- xbmc/cores/omxplayer/OMXPlayerVideo.cpp             |  9 +++++----
- xbmc/settings/AdvancedSettings.cpp                  |  3 +++
- 6 files changed, 25 insertions(+), 13 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index a1da64b..1fb7988 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -2989,6 +2989,11 @@ msgctxt "#680"
- msgid "Verbose logging for the [B]Video[/B] component"
- msgstr ""
- 
-+#: xbmc/settings/AdvancedSettings.cpp
-+msgctxt "#697"
-+msgid "Verbose logging for OMXPLAYER"
-+msgstr ""
-+
- #empty strings from id 681 to 699
- 
- msgctxt "#700"
-diff --git a/xbmc/commons/ilog.h b/xbmc/commons/ilog.h
-index de90359..e4ffb5e 100644
---- a/xbmc/commons/ilog.h
-+++ b/xbmc/commons/ilog.h
-@@ -53,6 +53,7 @@
- #define LOGUPNP     (1 << (LOGMASKBIT + 8))
- #define LOGCEC      (1 << (LOGMASKBIT + 9))
- #define LOGVIDEO    (1 << (LOGMASKBIT + 10))
-+#define LOGOMXPLAYER (1 << (LOGMASKBIT + 16))
- 
- #include "utils/params_check_macros.h"
- 
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index 3429cea..59c3a61 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -23,6 +23,7 @@
- #ifdef HAS_OMXPLAYER
- 
- #include "DVDPlayer.h"
-+#include "settings/AdvancedSettings.h"
- #include "settings/Settings.h"
- #include "settings/MediaSettings.h"
- #include "DVDInputStreams/DVDInputStream.h"
-@@ -155,7 +156,8 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     m_OmxPlayerState.video_fifo = (int)(100.0*(m_dvdPlayerVideo->GetDecoderBufferSize()-m_dvdPlayerVideo->GetDecoderFreeSpace())/m_dvdPlayerVideo->GetDecoderBufferSize());
-     m_OmxPlayerState.audio_fifo = (int)(100.0*audio_fifo/m_dvdPlayerAudio->GetCacheTotal());
- 
--    #ifdef _DEBUG
-+  if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+  {
-     static unsigned count;
-     if ((count++ & 7) == 0)
-     {
-@@ -175,7 +177,7 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-       vc_gencmd(response, sizeof response, "render_bar 7 audio_queue %d %d %d %d",
-             m_dvdPlayerAudio->GetLevel(), 0, 0, 100);
-     }
--    #endif
-+  }
-     if (audio_pts != DVD_NOPTS_VALUE)
-     {
-       audio_fifo_low = m_HasAudio && audio_fifo < threshold;
-@@ -191,15 +193,15 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     if (!m_HasVideo && m_HasAudio)
-       video_fifo_high = true;
- 
--    #ifdef _DEBUG
-+  if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+  {
-     CLog::Log(LOGDEBUG, "%s::%s M:%.6f-%.6f (A:%.6f V:%.6f) PEF:%d%d%d S:%.2f A:%.2f V:%.2f/T:%.2f (A:%d%d V:%d%d) A:%d%% V:%d%% (%.2f,%.2f)", "CDVDPlayer", __FUNCTION__,
-       m_OmxPlayerState.stamp*1e-6, m_OmxPlayerState.av_clock.OMXClockAdjustment()*1e-6, audio_pts*1e-6, video_pts*1e-6,
-       m_OmxPlayerState.av_clock.OMXIsPaused(), m_OmxPlayerState.bOmxSentEOFs, not_accepts_data, m_playSpeed * (1.0f/DVD_PLAYSPEED_NORMAL),
-       audio_pts == DVD_NOPTS_VALUE ? 0.0:audio_fifo, video_pts == DVD_NOPTS_VALUE ? 0.0:video_fifo, m_OmxPlayerState.threshold,
-       audio_fifo_low, audio_fifo_high, video_fifo_low, video_fifo_high,
-       m_dvdPlayerAudio->GetLevel(), m_dvdPlayerVideo->GetLevel(), m_dvdPlayerAudio->GetDelay(), (float)m_dvdPlayerAudio->GetCacheTotal());
--    #endif
--
-+  }
-     if(!m_Pause && (m_OmxPlayerState.bOmxSentEOFs || not_accepts_data || (audio_fifo_high && video_fifo_high) || m_playSpeed != DVD_PLAYSPEED_NORMAL))
-     {
-       if (m_OmxPlayerState.av_clock.OMXIsPaused())
-diff --git a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-index 1c3b904..2056962 100644
---- a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-@@ -366,10 +366,10 @@ void OMXPlayerAudio::Process()
-       DemuxPacket* pPacket = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacket();
-       bool bPacketDrop     = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacketDrop();
- 
--      #ifdef _DEBUG
--      CLog::Log(LOGINFO, "Audio: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d,%d", pPacket->dts, pPacket->pts,
--           (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, (int)m_omxAudio.GetAudioRenderingLatency(), (int)m_hints_current.samplerate);
--      #endif
-+      if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+        CLog::Log(LOGINFO, "Audio: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d,%d", pPacket->dts, pPacket->pts,
-+             (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, (int)m_omxAudio.GetAudioRenderingLatency(), (int)m_hints_current.samplerate);
-+
-       if(Decode(pPacket, m_speed > DVD_PLAYSPEED_NORMAL || m_speed < 0 || bPacketDrop))
-       {
-         // we are not running until something is cached in output device
-diff --git a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-index 0e04360..7c34e10 100644
---- a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-@@ -42,6 +42,7 @@
- #include "DVDOverlayRenderer.h"
- #include "settings/DisplaySettings.h"
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "settings/MediaSettings.h"
- #include "cores/VideoRenderers/RenderFormats.h"
- #include "cores/VideoRenderers/RenderFlags.h"
-@@ -452,10 +453,10 @@ void OMXPlayerVideo::Process()
-       DemuxPacket* pPacket = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacket();
-       bool bPacketDrop     = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacketDrop();
- 
--      #ifdef _DEBUG
--      CLog::Log(LOGINFO, "Video: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d\n", pPacket->dts, pPacket->pts, 
--          (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, 0);
--      #endif
-+      if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+        CLog::Log(LOGINFO, "Video: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d\n", pPacket->dts, pPacket->pts,
-+            (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, 0);
-+
-       if (m_messageQueue.GetDataSize() == 0
-       ||  m_speed < 0)
-       {
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 22b8459..8045a03 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -1359,6 +1359,9 @@ void CAdvancedSettings::SettingOptionsLoggingComponentsFiller(const CSetting *se
- #ifdef HAVE_LIBCEC
-   list.push_back(std::make_pair(g_localizeStrings.Get(679), LOGCEC));
- #endif
-+#ifdef TARGET_RASPBERRY_PI
-+  list.push_back(std::make_pair(g_localizeStrings.Get(697), LOGOMXPLAYER));
-+#endif
- }
- 
- void CAdvancedSettings::setExtraLogLevel(const std::vector<CVariant> &components)
-
-From 85731b224b68bac5a47774d5447bbd3e1d14236d Mon Sep 17 00:00:00 2001
-From: Memphiz <memphis@machzwo.de>
-Date: Tue, 18 Nov 2014 13:28:36 +0100
-Subject: [PATCH 55/93] - evaluate the setting for hiding watched
- movies/episodes/musicvideos in recently added job (should influence
- homescreen of skins only)
-
----
- xbmc/utils/RecentlyAddedJob.cpp | 10 +++++-----
- xbmc/video/VideoDatabase.cpp    | 27 ++++++++++++++++++++++++---
- xbmc/video/VideoDatabase.h      |  6 +++---
- 3 files changed, 32 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/utils/RecentlyAddedJob.cpp b/xbmc/utils/RecentlyAddedJob.cpp
-index de98a51..19eea07 100644
---- a/xbmc/utils/RecentlyAddedJob.cpp
-+++ b/xbmc/utils/RecentlyAddedJob.cpp
-@@ -30,6 +30,7 @@
- #include "music/tags/MusicInfoTag.h"
- #include "utils/StringUtils.h"
- #include "settings/AdvancedSettings.h"
-+#include "settings/Settings.h"
- #include "music/MusicThumbLoader.h"
- #include "video/VideoThumbLoader.h"
- 
-@@ -56,8 +57,8 @@ bool CRecentlyAddedJob::UpdateVideo()
-   loader.OnLoaderStart();
-   
-   videodatabase.Open();
--
--  if (videodatabase.GetRecentlyAddedMoviesNav("videodb://recentlyaddedmovies/", items, NUM_ITEMS))
-+  bool hideWatched = CSettings::GetInstance().GetBool("videolibrary.hiderecentlywatchedvideos");
-+  if (videodatabase.GetRecentlyAddedMoviesNav("videodb://recentlyaddedmovies/", items, NUM_ITEMS, hideWatched))
-   {  
-     for (; i < items.Size(); ++i)
-     {
-@@ -96,8 +97,7 @@ bool CRecentlyAddedJob::UpdateVideo()
-  
-   i = 0;
-   CFileItemList  TVShowItems; 
-- 
--  if (videodatabase.GetRecentlyAddedEpisodesNav("videodb://recentlyaddedepisodes/", TVShowItems, NUM_ITEMS))
-+  if (videodatabase.GetRecentlyAddedEpisodesNav("videodb://recentlyaddedepisodes/", TVShowItems, NUM_ITEMS, hideWatched))
-   {
-     for (; i < TVShowItems.Size(); ++i)
-     {    
-@@ -150,7 +150,7 @@ bool CRecentlyAddedJob::UpdateVideo()
-   i = 0;
-   CFileItemList MusicVideoItems;
- 
--  if (videodatabase.GetRecentlyAddedMusicVideosNav("videodb://recentlyaddedmusicvideos/", MusicVideoItems, NUM_ITEMS))
-+  if (videodatabase.GetRecentlyAddedMusicVideosNav("videodb://recentlyaddedmusicvideos/", MusicVideoItems, NUM_ITEMS, hideWatched))
-   {
-     for (; i < MusicVideoItems.Size(); ++i)
-     {
-diff --git a/xbmc/video/VideoDatabase.cpp b/xbmc/video/VideoDatabase.cpp
-index b56e2e8..6db3c7e 100644
---- a/xbmc/video/VideoDatabase.cpp
-+++ b/xbmc/video/VideoDatabase.cpp
-@@ -6466,27 +6466,48 @@ bool CVideoDatabase::GetMusicVideosNav(const std::string& strBaseDir, CFileItemL
-   return GetMusicVideosByWhere(videoUrl.ToString(), filter, items, true, sortDescription);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idMovie desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetMoviesByWhere(strBaseDir, filter, items);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idEpisode desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetEpisodesByWhere(strBaseDir, filter, items, false);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idMVideo desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetMusicVideosByWhere(strBaseDir, filter, items);
- }
- 
-diff --git a/xbmc/video/VideoDatabase.h b/xbmc/video/VideoDatabase.h
-index 2021dd9..5f67d10 100644
---- a/xbmc/video/VideoDatabase.h
-+++ b/xbmc/video/VideoDatabase.h
-@@ -693,9 +693,9 @@ public:
-   bool GetEpisodesNav(const std::string& strBaseDir, CFileItemList& items, int idGenre=-1, int idYear=-1, int idActor=-1, int idDirector=-1, int idShow=-1, int idSeason=-1, const SortDescription &sortDescription = SortDescription());
-   bool GetMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, int idGenre=-1, int idYear=-1, int idArtist=-1, int idDirector=-1, int idStudio=-1, int idAlbum=-1, int idTag=-1, const SortDescription &sortDescription = SortDescription());
-   
--  bool GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
--  bool GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
--  bool GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
-+  bool GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
-+  bool GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
-+  bool GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
- 
-   bool HasContent();
-   bool HasContent(VIDEODB_CONTENT_TYPE type);
-
-From 11a94854f5d433c121314cdf989fd0e7bcc0102b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 14:28:37 +0100
-Subject: [PATCH 56/93] build: Add vcsm lib
-
----
- configure.ac | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/configure.ac b/configure.ac
-index 7a06a31..239a2a1 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -1012,7 +1012,7 @@ if test "$use_gles" = "yes"; then
-       AC_DEFINE([HAVE_LIBEGL],[1],["Define to 1 if you have the `EGL' library (-lEGL)."])
-       AC_DEFINE([HAVE_LIBGLESV2],[1],["Define to 1 if you have the `GLESv2' library (-lGLESv2)."])
-       AC_MSG_RESULT(== WARNING: OpenGLES support is assumed.)
--      LIBS="$LIBS -lEGL -lGLESv2 -lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util"
-+      LIBS="$LIBS -lEGL -lGLESv2 -lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm"
-     else
-       AC_CHECK_LIB([EGL],   [main],, AC_MSG_ERROR($missing_library))
-       AC_CHECK_LIB([GLESv2],[main],, AC_MSG_ERROR($missing_library))
-
-From 0077cb637667dc35084234c792dedfc8c5e80485 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 8 May 2015 14:09:31 +0100
-Subject: [PATCH 57/93] [cec] re-implement 'RFC' style POLLing for LA
- registering process
-
----
- ...t-RFC-style-POLLing-for-LA-registering-pr.patch | 194 +++++++++++++++++++++
- tools/depends/target/libcec/Makefile               |   1 +
- 2 files changed, 195 insertions(+)
- create mode 100644 tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-
-diff --git a/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch b/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-new file mode 100644
-index 0000000..24bf69f
---- /dev/null
-+++ b/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-@@ -0,0 +1,194 @@
-+From 8b8b9cd9d3821514d02d53214cf65af5b54543ed Mon Sep 17 00:00:00 2001
-+From: Matus Kral <matuskral@me.com>
-+Date: Fri, 8 May 2015 14:48:48 +0200
-+Subject: [PATCH] re-implement 'RFC' style POLLing for LA registering process
-+ (org A = dest A)
-+
-+- initially, sending empty asymetric msg (from X to Y) seemed to
-+  deliver same result. It is not - there are devices responding with NACK
-+  to msg send attempt in case they are busy (already receiving msg).
-+  ACK is returned only on 'RFC' POLL msg.
-+---
-+ .../adapter/RPi/RPiCECAdapterCommunication.cpp     | 49 +++++++++++++++++++++-
-+ .../adapter/RPi/RPiCECAdapterCommunication.h       |  3 ++
-+ .../adapter/RPi/RPiCECAdapterMessageQueue.cpp      | 40 ++++++++++++++----
-+ 3 files changed, 81 insertions(+), 11 deletions(-)
-+
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+index 1e93838..6f0804d 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+@@ -70,7 +70,8 @@ CRPiCECAdapterCommunication::CRPiCECAdapterCommunication(IAdapterCommunicationCa
-+     m_logicalAddress(CECDEVICE_UNKNOWN),
-+     m_bLogicalAddressChanged(false),
-+     m_previousLogicalAddress(CECDEVICE_FREEUSE),
-+-    m_bLogicalAddressRegistered(false)
-++    m_bLogicalAddressRegistered(false),
-++    m_bDisableCallbacks(false)
-+ {
-+   m_queue = new CRPiCECAdapterMessageQueue(this);
-+ }
-+@@ -140,6 +141,12 @@ void CRPiCECAdapterCommunication::OnTVServiceCallback(uint32_t reason, uint32_t
-+ 
-+ void CRPiCECAdapterCommunication::OnDataReceived(uint32_t header, uint32_t p0, uint32_t p1, uint32_t p2, uint32_t p3)
-+ {
-++  {
-++    CLockObject lock(m_mutex);
-++    if (m_bDisableCallbacks)
-++      return;
-++  }
-++
-+   VC_CEC_NOTIFY_T reason = (VC_CEC_NOTIFY_T)CEC_CB_REASON(header);
-+ 
-+ #ifdef CEC_DEBUGGING
-+@@ -363,12 +370,50 @@ std::string CRPiCECAdapterCommunication::GetError(void) const
-+   return strError;
-+ }
-+ 
-++void CRPiCECAdapterCommunication::SetDisableCallback(const bool disable)
-++{
-++  CLockObject lock(m_mutex);
-++  m_bDisableCallbacks = disable;
-++}
-++
-+ cec_adapter_message_state CRPiCECAdapterCommunication::Write(const cec_command &data, bool &bRetry, uint8_t iLineTimeout, bool bIsReply)
-+ {
-+   VC_CEC_ERROR_T vcAnswer;
-+   uint32_t iTimeout = (data.transmit_timeout ? data.transmit_timeout : iLineTimeout*1000);
-++  cec_adapter_message_state rc;
-++
-++  // to send a real POLL (dest & source LA the same - eg 11), VC
-++  // needs us to be in passivemode(we are) and with no actual LA
-++  // registered
-++  // libCEC sends 'true' POLLs only when at LA choosing process.
-++  // any other POLLing of devices happens with regular 'empty'
-++  // msg (just header, no OPCODE) with actual LA as source to X.
-++  // for us this means, that libCEC already registered tmp LA
-++  // (0xf, 0xe respectively) before it calls us for LA POLLing.
-++  //
-++  // that means - unregistering any A from adapter, _while_
-++  // ignoring callbacks (and especialy not reporting the
-++  // subsequent actions generated from VC layer - like
-++  // LA change to 0xf ...)
-++  //
-++  // calling vc_cec_release_logical_address() over and over is
-++  // fine.
-++  // once libCEC gets NACK on tested A, it calls RegisterLogicalAddress()
-++  // on it's own - so we don't need to take care of re-registering
-++  if (!data.opcode_set && data.initiator == data.destination)
-++  {
-++    SetDisableCallback(true);
-++
-++    vc_cec_release_logical_address();
-++    // accept nothing else than NACK or ACK, repeat until this happens
-++    while (ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT ==
-++          (rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer)));
-++
-++    SetDisableCallback(false);
-++    return rc;
-++  }
-+ 
-+-  cec_adapter_message_state rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer);
-++  rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer);
-+ #ifdef CEC_DEBUGGING
-+   LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending data: result %s", ToString(vcAnswer));
-+ #endif
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-+index ba6d000..6024a27 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-+@@ -100,6 +100,7 @@ namespace CEC
-+     bool UnregisterLogicalAddress(void);
-+     bool RegisterLogicalAddress(const cec_logical_address address, uint32_t iTimeoutMs = CEC_DEFAULT_CONNECT_TIMEOUT);
-+     int InitHostCEC(void);
-++    void SetDisableCallback(const bool disable);
-+ 
-+     bool m_bInitialised;   /**< true when the connection is initialised, false otherwise */
-+     std::string m_strError; /**< current error message */
-+@@ -113,6 +114,8 @@ namespace CEC
-+     VCHI_CONNECTION_T *         m_vchi_connection;
-+     cec_logical_address         m_previousLogicalAddress;
-+     bool                        m_bLogicalAddressRegistered;
-++
-++    bool                        m_bDisableCallbacks;
-+   };
-+ };
-+ 
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp b/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-+index 361ba38..169201d 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-+@@ -53,10 +53,13 @@ using namespace PLATFORM;
-+ 
-+ #define LIB_CEC m_com->m_callback->GetLib()
-+ 
-++// initialise new msg with unsuccesfull status, also
-++// set default return state to "UNKNOWN" - instead
-++// of NACK (which has special meaning for dev POLLing)
-+ CRPiCECAdapterMessageQueueEntry::CRPiCECAdapterMessageQueueEntry(CRPiCECAdapterMessageQueue *queue, const cec_command &command) :
-+     m_queue(queue),
-+     m_command(command),
-+-    m_retval(VC_CEC_ERROR_NO_ACK),
-++    m_retval(VC_CEC_ERROR_BUSY),
-+     m_bSucceeded(false)
-+ {
-+ 
-+@@ -130,6 +133,27 @@ uint32_t CRPiCECAdapterMessageQueueEntry::Result() const
-+ 
-+ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &command, bool &bRetry, uint32_t iLineTimeout, bool bIsReply, VC_CEC_ERROR_T &vcReply)
-+ {
-++  // handle POLL (msg like '11') in a special way - the way it was
-++  // originally designed by BCM, expected to happen and documented
-++  // in API docs (/opt/vc/includes)
-++  // due to often (more than 20% test cases - CEC bus with 8 devices)
-++  // irregularities on returned status, repeat until we get SAME
-++  // result twice in a row
-++  if (!command.opcode_set && command.destination == command.initiator)
-++  {
-++    int iReturnPrev = -1;
-++    int iReturn = 0;
-++
-++    while((iReturn = vc_cec_poll_address((CEC_AllDevices_T)command.destination)) != iReturnPrev)
-++      iReturnPrev = iReturn;
-++    if (iReturn == 0)
-++      return ADAPTER_MESSAGE_STATE_SENT_ACKED;
-++    else if (iReturn > 0)
-++      return ADAPTER_MESSAGE_STATE_SENT_NOT_ACKED;
-++    else
-++      return ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT;
-++  }
-++
-+   CRPiCECAdapterMessageQueueEntry *entry = new CRPiCECAdapterMessageQueueEntry(this, command);
-+   uint64_t iEntryId(0);
-+   /* add to the wait for ack queue */
-+@@ -192,8 +216,9 @@ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &c
-+   bRetry = false;
-+   if (iReturn != VCHIQ_SUCCESS)
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending command '%s' failed (%d)", command.opcode_set ? CCECTypeUtils::ToString(command.opcode) : "POLL", iReturn);
-+-    delete (entry);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending command '%s' failed (%d)", CCECTypeUtils::ToString(command.opcode), iReturn);
-++    delete entry;
-++    m_messages.erase(iEntryId);
-+     return ADAPTER_MESSAGE_STATE_ERROR;
-+   }
-+ 
-+@@ -213,12 +238,9 @@ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &c
-+     }
-+     else
-+     {
-+-      if (command.opcode_set)
-+-      {
-+-        bRetry = true;
-+-        LIB_CEC->AddLog(CEC_LOG_DEBUG, "command '%s' timeout", command.opcode_set ? CCECTypeUtils::ToString(command.opcode) : "POLL");
-+-        sleep(CEC_DEFAULT_TRANSMIT_RETRY_WAIT);
-+-      }
-++      bRetry = true;
-++      LIB_CEC->AddLog(CEC_LOG_DEBUG, "command '%s' timeout", CCECTypeUtils::ToString(command.opcode));
-++      sleep(CEC_DEFAULT_TRANSMIT_RETRY_WAIT);
-+       bReturn = ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT;
-+     }
-+ 
-+-- 
-+1.9.1
-+
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index ddf9963..5d1f933 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -22,6 +22,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)/build
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-
-From 3916ef0e55ad307d7a3e0f88ba5df0cdc73d5477 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 58/93] ffmpeg: test: increase number of threads
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-index 3498503..c2f3287 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-@@ -265,6 +265,9 @@ bool CDVDVideoCodecFFmpeg::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options
-     else
-     {
-       int num_threads = std::min(8 /*MAX_THREADS*/, g_cpuInfo.getCPUCount());
-+#ifdef TARGET_RASPBERRY_PI
-+      num_threads = num_threads > 1 ? 2 * num_threads : num_threads;
-+#endif
-       if( num_threads > 1)
-         m_pCodecContext->thread_count = num_threads;
-       m_pCodecContext->thread_safe_callbacks = 1;
-
-From 36fd4c27fe9af15d65461e32b8d105e00fd8df52 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 59/93] ffmpeg: Allow neon to be enabled in unified builds
-
----
- tools/depends/target/ffmpeg/Makefile | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index 6a9f105..fef5ef2 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -23,7 +23,11 @@ ffmpg_config += --enable-gnutls
- ffmpg_config += --enable-libdcadec
- 
- ifeq ($(CROSS_COMPILING), yes)
-+ ifeq ($(CPU), cortex-a7)
-+  ffmpg_config += --arch=arm --enable-cross-compile
-+ else
-   ffmpg_config += --arch=$(CPU) --enable-cross-compile
-+ endif
- endif
- ifeq ($(OS), linux)
-   ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
-
-From 5d5698967a69035d742d55f8986bce84831e73e9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 5 Mar 2015 20:00:59 +0000
-Subject: [PATCH 60/93] [ffmpmeg] Discard data before VO/VOL in mpeg-4 over
- mpegts
-
----
- ...-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch | 69 ++++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |  4 +-
- 2 files changed, 72 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch b/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-new file mode 100644
-index 0000000..eef7385
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-@@ -0,0 +1,69 @@
-+From ff289b3678b3b102f76c0fc0ffc802e3c8026fdb Mon Sep 17 00:00:00 2001
-+From: Deborah Crook <deborah@kynesim.co.uk>
-+Date: Thu, 5 Mar 2015 19:48:43 +0000
-+Subject: [PATCH] Discard data before VO/VOL in mpeg-4 over mpegts
-+
-+---
-+ libavcodec/mpeg4video_parser.c | 26 ++++++++++++++++++++++----
-+ 1 file changed, 22 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/mpeg4video_parser.c b/libavcodec/mpeg4video_parser.c
-+index aa5e87a..0d8b15a 100644
-+--- a/libavcodec/mpeg4video_parser.c
-++++ b/libavcodec/mpeg4video_parser.c
-+@@ -43,18 +43,32 @@ int ff_mpeg4_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
-+     state     = pc->state;
-+
-+     i = 0;
-+-    if (!vop_found) {
-++    if (vop_found < 0) {
-++        for (i = 0; i < buf_size; i++) {
-++            state = (state << 8) | buf[i];
-++            if (state >= 0x100 && state <= 0x12f) {
-++                i++;
-++                vop_found = 0;
-++                break;
-++            }
-++        }
-++    }
-++
-++    if (vop_found == 0)
-++        vop_found = 1;
-++
-++    if (vop_found == 1) {
-+         for (i = 0; i < buf_size; i++) {
-+             state = (state << 8) | buf[i];
-+             if (state == 0x1B6) {
-+                 i++;
-+-                vop_found = 1;
-++                vop_found = 2;
-+                 break;
-+             }
-+         }
-+     }
-+
-+-    if (vop_found) {
-++    if (vop_found == 2) {
-+         /* EOF considered as end of frame */
-+         if (buf_size == 0)
-+             return 0;
-+@@ -133,12 +147,16 @@ static int mpeg4video_parse(AVCodecParserContext *s,
-+     ParseContext *pc = s->priv_data;
-+     int next;
-+
-++    if (pc->frame_start_found == 0 && !avctx->extradata)
-++        pc->frame_start_found = -1;
-++
-+     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
-+         next = buf_size;
-+     } else {
-+         next = ff_mpeg4_find_frame_end(pc, buf, buf_size);
-+
-+-        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-++        if (pc->frame_start_found < 0 ||
-++            ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-+             *poutbuf      = NULL;
-+             *poutbuf_size = 0;
-+             return buf_size;
-+--
-+2.1.4
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index fef5ef2..e780521 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,7 +1,8 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
--  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-+  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -77,6 +78,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-
-From 4a4b1b0427cfb3116a112d682d10c802a71f913a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 61/93] ffmpeg: Add some upstream HEVC optimisations
-
----
- tools/depends/target/ffmpeg/Makefile               |    6 +-
- .../added_ARM_NEON_optimized_SAO_patches.patch     | 3328 ++++++++++++++++++++
- ...hevcdsp_ARM_NEON_optimized_epel_functions.patch |  409 +++
- 3 files changed, 3742 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
- create mode 100644 tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index e780521..58ec0eb 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -2,7 +2,8 @@ include ../../Makefile.include
- include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
--  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
-+  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -79,6 +80,9 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
-+	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch b/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
-new file mode 100644
-index 0000000..792b5fe
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
-@@ -0,0 +1,3328 @@
-+From b0cb307c253d2c9f4b94a54dfc74ddb83af984cc Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 8 Dec 2014 13:24:40 +0200
-+Subject: [PATCH 1/9] added ARM NEON optimized SAO band offset
-+
-+---
-+ libavcodec/arm/Makefile            |   3 +-
-+ libavcodec/arm/hevcdsp_init_neon.c |  47 +++++++++
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 204 +++++++++++++++++++++++++++++++++++++
-+ 3 files changed, 253 insertions(+), 1 deletion(-)
-+ create mode 100644 libavcodec/arm/hevcdsp_sao_neon.S
-+
-+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-+index 6051ec8..093a2e8 100644
-+--- a/libavcodec/arm/Makefile
-++++ b/libavcodec/arm/Makefile
-+@@ -133,7 +133,8 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-+                                           arm/hevcdsp_deblock_neon.o    \
-+                                           arm/hevcdsp_epel_neon.o       \
-+                                           arm/hevcdsp_idct_neon.o       \
-+-                                          arm/hevcdsp_qpel_neon.o
-++                                          arm/hevcdsp_qpel_neon.o       \
-++                                          arm/hevcdsp_sao_neon.o
-+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
-+ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
-+                                           arm/rv40dsp_neon.o
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 733ff08..69e2b2c 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -22,6 +22,7 @@
-+ #include "libavutil/arm/cpu.h"
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-++#include "../bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+@@ -43,6 +44,11 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+                                 ptrdiff_t srcstride, int height, \
-+@@ -151,6 +157,44 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
-+     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
-+ }
-+ 
-++static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
-++{
-++    pixel *dst = (pixel *)_dst;
-++    pixel *src = (pixel *)_src;
-++    int8_t offset_table[32] = { 0 };
-++    int k, y, x;
-++    int shift  = 3; // BIT_DEPTH - 5
-++
-++    stride_src /= sizeof(pixel);
-++    stride_dst /= sizeof(pixel);
-++
-++    for (k = 0; k < 4; k++)
-++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-++
-++    switch(width){
-++    case 8:
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 16:
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 32:
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 64:
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    default:
-++        for (y = 0; y < height; y++) {
-++            for (x = 0; x < width; x++)
-++                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-++            dst += stride_dst;
-++            src += stride_src;
-++        }
-++    }
-++}
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -170,6 +214,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-+         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-+         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-++        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-++          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-++        }
-+         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-+         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-+         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+new file mode 100644
-+index 0000000..1f0ad64
-+--- /dev/null
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -0,0 +1,204 @@
-++/*
-++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ *
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#include "libavutil/arm/asm.S"
-++#include "neon.S"
-++
-++function ff_hevc_sao_band_w8_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8   {d24}, [r1], r3
-++        vshr.u8  d16, d24, #3
-++        vtbl.8   d16, {q0, q1}, d16
-++        vmovl.s8 q2, d16
-++        vmovl.u8 q6, d24
-++        vadd.s16 q2, q6
-++        vqmovun.s16 d4, q2
-++        vst1.8  {d4}, [r0], r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w16_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12}, [r1], r3
-++
-++        vshr.u8   q8, q12, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++
-++        vstm.8   r0, {q2}
-++        add    r0, r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12-q13}, [r1], r3
-++
-++        vshr.u8   q8, q12, #3
-++        vshr.u8   q9, q13, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++        vtbl.8  d18, {q0, q1}, d18
-++        vtbl.8  d19, {q0, q1}, d19
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17 // q8 free
-++        vmovl.s8 q4, d18
-++        vmovl.s8 q5, d19 // q9 free
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25 // q12 free
-++        vmovl.u8 q8, d26
-++        vmovl.u8 q9, d27 // q13 free
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++        vqmovun.s16 d6, q4 // q4 free
-++        vqmovun.s16 d7, q5 // q5 free
-++
-++        vst1.8 {q2-q3}, [r0], r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12-q13}, [r1]!
-++        vld1.8  {q14-q15}, [r1], r3
-++        sub     r1, #32
-++
-++        vshr.u8   q8, q12, #3
-++        vshr.u8   q9, q13, #3
-++        vshr.u8  q10, q14, #3
-++        vshr.u8  q11, q15, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++        vtbl.8  d18, {q0, q1}, d18
-++        vtbl.8  d19, {q0, q1}, d19
-++        vtbl.8  d20, {q0, q1}, d20
-++        vtbl.8  d21, {q0, q1}, d21
-++        vtbl.8  d22, {q0, q1}, d22
-++        vtbl.8  d23, {q0, q1}, d23
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17 // q8 free
-++        vmovl.s8 q4, d18
-++        vmovl.s8 q5, d19 // q9 free
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25 // q12 free
-++        vmovl.u8 q8, d26
-++        vmovl.u8 q9, d27 // q13 free
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++        vqmovun.s16 d6, q4 // q4 free
-++        vqmovun.s16 d7, q5 // q5 free
-++
-++        // free q4 -q9, q12 - q13
-++        vmovl.s8 q4, d20
-++        vmovl.s8 q5, d21 // q10 free
-++        vmovl.s8 q6, d22
-++        vmovl.s8 q7, d23 // q11 free
-++
-++        vmovl.u8  q8, d28
-++        vmovl.u8  q9, d29 // q14 free
-++        vmovl.u8 q10, d30
-++        vmovl.u8 q11, d31 // q15 free
-++
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++        vadd.s16 q6, q10
-++        vadd.s16 q7, q11
-++
-++        vqmovun.s16  d8, q4
-++        vqmovun.s16  d9, q5
-++        vqmovun.s16 d10, q6
-++        vqmovun.s16 d11, q7
-++
-++        vstm.8   r0, {q2-q5}
-++        add    r0, r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-+-- 
-+2.5.0
-+
-+
-+From 8429b1de64bb871d57651ecfe3b084e2dfe0af51 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Wed, 27 May 2015 18:10:20 +0100
-+Subject: [PATCH 2/9] added NEON optimized sao edge for eo1 width 64
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  47 ++++++++++++
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 147 +++++++++++++++++++++++++++++++++++++
-+ 2 files changed, 194 insertions(+)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 69e2b2c..c7b5404 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -22,6 +22,7 @@
-+ #include "libavutil/arm/cpu.h"
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-++#include "libavcodec/avcodec.h"
-+ #include "../bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+@@ -48,6 +49,7 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
-+ void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ 
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+@@ -195,6 +197,50 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     }
-+ }
-+ 
-++#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-++static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-++                                          int16_t *_sao_offset_val, int eo, int width, int height)
-++{
-++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-++    static const int8_t pos[4][2][2] = {
-++        { { -1,  0 }, {  1, 0 } }, // horizontal
-++        { {  0, -1 }, {  0, 1 } }, // vertical
-++        { { -1, -1 }, {  1, 1 } }, // 45 degree
-++        { {  1, -1 }, { -1, 1 } }, // 135 degree
-++    };
-++    int8_t sao_offset_val[8];  // padding of 3 for vld
-++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
-++    pixel *dst = (pixel *)_dst;
-++    pixel *src = (pixel *)_src;
-++    int a_stride, b_stride;
-++    int x, y;
-++
-++    for (x = 0; x < 5; x++) {
-++        sao_offset_val[x] = _sao_offset_val[x];
-++    }
-++
-++    stride_src /= sizeof(pixel);
-++    stride_dst /= sizeof(pixel);
-++
-++    if (eo == 1 && width == 64) {
-++        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++    } else {
-++        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-++        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-++        for (y = 0; y < height; y++) {
-++            for (x = 0; x < width; x++) {
-++                int diff0         = CMP(src[x], src[x + a_stride]);
-++                int diff1         = CMP(src[x], src[x + b_stride]);
-++                int offset_val    = edge_idx[2 + diff0 + diff1];
-++                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-++            }
-++            src += stride_src;
-++            dst += stride_dst;
-++        }
-++    }
-++}
-++#undef CMP
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -216,6 +262,7 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-+         for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-+           c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-++          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
-+         }
-+         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-+         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 1f0ad64..5ec2de9 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -202,3 +202,150 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x02
-++        vpush {d8-d15}
-++1:      subs    r4, #1
-++        // load a
-++        sub     r1, r3
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #32
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1], r3
-++        sub     r1, #32
-++
-++        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q5, q1
-++        vcgt.u8 q1,  q1, q5
-++        vcgt.u8 q14, q6, q2
-++        vcgt.u8 q2,  q2, q6
-++        vcgt.u8 q15, q7, q3
-++        vcgt.u8 q3,  q3, q7
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vsub.s8 q13, q1, q13
-++        vsub.s8 q14, q2, q14
-++        vsub.s8 q15, q3, q15
-++
-++        vcgt.u8  q0,  q4, q8 // c > b
-++        vcgt.u8  q8,  q8, q4 // b > c
-++        vcgt.u8  q1,  q5, q9
-++        vcgt.u8  q9,  q9, q5
-++        vcgt.u8  q2,  q6, q10
-++        vcgt.u8 q10, q10, q6
-++        vcgt.u8  q3,  q7, q11
-++        vcgt.u8 q11, q11, q7
-++
-++        vsub.s8 q0, q8, q0 // diff1
-++        vsub.s8 q1, q9, q1
-++        vsub.s8 q2, q10, q2
-++        vsub.s8 q3, q11, q3
-++
-++        veor.u8 q8, q8  // zero register
-++        vdup.s8 q9, r6  // 2 to all elements
-++        add     r6, #1
-++        vdup.s8 q10, r6 // 3 to all elements
-++        sub     r6, #1
-++
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        vadd.s8 q2, q14
-++        vadd.s8 q3, q15
-++
-++        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
-++        vcgt.s8 q5, q1, q8
-++        vcgt.s8 q6, q2, q8
-++        vcgt.s8 q7, q3, q8
-++
-++        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
-++        vclt.s8 q12, q1, q8
-++        vclt.s8 q13, q2, q8
-++        vclt.s8 q14, q3, q8
-++
-++        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
-++        vand.8  q15,  q8, q4
-++        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
-++        vand.8   q8,  q8, q11
-++        vadd.s8  q0, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
-++        vand.8  q15,  q8, q5
-++        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
-++        vand.8   q8,  q8, q12
-++        vadd.s8  q1, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
-++        vand.8  q15,  q8, q6
-++        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
-++        vand.8   q8,  q8, q13
-++        vadd.s8  q2, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
-++        vand.8  q15,  q8, q7
-++        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
-++        vand.8   q8,  q8, q14
-++        vadd.s8  q3, q15, q8  // offset_idx
-++        // TODO: load only once
-++        vld1.8   d16, [r5]
-++
-++        vtbl.8   d0, {d16}, d0
-++        vtbl.8   d1, {d16}, d1
-++        vtbl.8   d2, {d16}, d2
-++        vtbl.8   d3, {d16}, d3
-++        vtbl.8   d4, {d16}, d4
-++        vtbl.8   d5, {d16}, d5
-++        vtbl.8   d6, {d16}, d6
-++        vtbl.8   d7, {d16}, d7
-++
-++        // TODO: load only once
-++        // load c again
-++        sub     r1, r3
-++        sub     r1, r3
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++
-++        vmovl.u8   q8, d8
-++        vmovl.u8   q9, d9
-++        vmovl.u8  q10, d10
-++        vmovl.u8  q11, d11
-++        vmovl.u8  q12, d12
-++        vmovl.u8  q13, d13
-++        vmovl.u8  q14, d14
-++        vmovl.u8  q15, d15
-++
-++        vaddw.s8  q8, d0
-++        vaddw.s8  q9, d1
-++        vaddw.s8 q10, d2
-++        vaddw.s8 q11, d3
-++        vaddw.s8 q12, d4
-++        vaddw.s8 q13, d5
-++        vaddw.s8 q14, d6
-++        vaddw.s8 q15, d7
-++
-++        vqmovun.s16  d0, q8
-++        vqmovun.s16  d1, q9
-++        vqmovun.s16  d2, q10
-++        vqmovun.s16  d3, q11
-++        vqmovun.s16  d4, q12
-++        vqmovun.s16  d5, q13
-++        vqmovun.s16  d6, q14
-++        vqmovun.s16  d7, q15
-++
-++        vstm r0, {q0-q3}
-++        add  r0, r2
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-+-- 
-+2.5.0
-+
-+
-+From 402e2bd1c5ad659c757bf9734abe6331904fb9e2 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Tue, 16 Dec 2014 16:28:25 +0200
-+Subject: [PATCH 3/9] Added SAO edge offset for ARM NEON w32 and w64
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  46 +++-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 510 +++++++++++++++++++++++++++++++------
-+ 2 files changed, 474 insertions(+), 82 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index c7b5404..c32940e 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -49,7 +49,16 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
-+ void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++
-++void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++
-++void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ 
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+@@ -222,9 +231,40 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+ 
-+-    if (eo == 1 && width == 64) {
-+-        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+-    } else {
-++    switch (width) {
-++    case 32:
-++        switch(eo) {
-++        case 0:
-++            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 1:
-++            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 2:
-++            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 3:
-++            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        }
-++        break;
-++    case 64:
-++        switch(eo) {
-++        case 0:
-++            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 1:
-++            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 2:
-++            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 3:
-++            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        }
-++        break;
-++    default:
-+         a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-+         b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 5ec2de9..4687012 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -202,27 +202,7 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-+-function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x02
-+-        vpush {d8-d15}
-+-1:      subs    r4, #1
-+-        // load a
-+-        sub     r1, r3
-+-        vld1.8  {q0-q1}, [r1]!
-+-        vld1.8  {q2-q3}, [r1], r3
-+-        sub     r1, #32
-+-        // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-+-        sub     r1, #32
-+-        // load b
-+-        vld1.8  {q8-q9}, [r1]!
-+-        vld1.8  {q10-q11}, [r1], r3
-+-        sub     r1, #32
-+-
-++.macro edge_w64_body
-+         vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+         vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+         vcgt.u8 q13, q5, q1
-+@@ -251,69 +231,61 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+         vsub.s8 q2, q10, q2
-+         vsub.s8 q3, q11, q3
-+ 
-+-        veor.u8 q8, q8  // zero register
-+-        vdup.s8 q9, r6  // 2 to all elements
-+-        add     r6, #1
-+-        vdup.s8 q10, r6 // 3 to all elements
-+-        sub     r6, #1
-+-
-+         vadd.s8 q0, q12 //diff0 + diff1
-+         vadd.s8 q1, q13
-+         vadd.s8 q2, q14
-+         vadd.s8 q3, q15
-+ 
-+-        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
-+-        vcgt.s8 q5, q1, q8
-+-        vcgt.s8 q6, q2, q8
-+-        vcgt.s8 q7, q3, q8
-+-
-+-        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
-+-        vclt.s8 q12, q1, q8
-+-        vclt.s8 q13, q2, q8
-+-        vclt.s8 q14, q3, q8
-+-
-+-        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
-+-        vand.8  q15,  q8, q4
-+-        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
-+-        vand.8   q8,  q8, q11
-+-        vadd.s8  q0, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
-+-        vand.8  q15,  q8, q5
-+-        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
-+-        vand.8   q8,  q8, q12
-+-        vadd.s8  q1, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
-+-        vand.8  q15,  q8, q6
-+-        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
-+-        vand.8   q8,  q8, q13
-+-        vadd.s8  q2, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
-+-        vand.8  q15,  q8, q7
-+-        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
-+-        vand.8   q8,  q8, q14
-+-        vadd.s8  q3, q15, q8  // offset_idx
-+-        // TODO: load only once
-+-        vld1.8   d16, [r5]
-+-
-+-        vtbl.8   d0, {d16}, d0
-+-        vtbl.8   d1, {d16}, d1
-+-        vtbl.8   d2, {d16}, d2
-+-        vtbl.8   d3, {d16}, d3
-+-        vtbl.8   d4, {d16}, d4
-+-        vtbl.8   d5, {d16}, d5
-+-        vtbl.8   d6, {d16}, d6
-+-        vtbl.8   d7, {d16}, d7
-+-
-+-        // TODO: load only once
-+-        // load c again
-+-        sub     r1, r3
-+-        sub     r1, r3
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-+-        sub     r1, #32
-++        vdup.s8 q9, r6 // 3 to all elements
-++        sub     r6, #1
-++
-++        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-++        vclt.s8 q13, q1, #0
-++        vclt.s8 q14, q2, #0
-++        vclt.s8 q15, q3, #0
-++
-++        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
-++        vadd.s8  q10,  q1, q9
-++        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-++        vand.8   q13, q10, q13
-++        vadd.s8  q8,  q2, q9
-++        vadd.s8  q10,  q3, q9
-++        vand.8   q14, q8, q14
-++        vand.8   q15, q10, q15
-++
-++        vdup.s8 q9, r6  // 2 to all elements
-++        add     r6, #1
-++
-++        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-++        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q1, #0
-++        vadd.s8   q0, q11, q12  // offset_idx
-++
-++        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
-++        vcgt.s8  q12, q2, #0
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
-++        vadd.s8   q1, q11, q13
-++
-++        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q3, #0
-++        vadd.s8   q2, q11, q14
-++
-++        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
-++        vmov.32  d18[0], r7  // load offset table from general registers
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vmov.32  d18[1], r5  // load rest of offset table
-++        vadd.s8   q3, q11, q15
-++
-++        vtbl.8   d0, {d18}, d0
-++        vtbl.8   d1, {d18}, d1
-++        vtbl.8   d2, {d18}, d2
-++        vtbl.8   d3, {d18}, d3
-++        vtbl.8   d4, {d18}, d4
-++        vtbl.8   d5, {d18}, d5
-++        vtbl.8   d6, {d18}, d6
-++        vtbl.8   d7, {d18}, d7
-+ 
-+         vmovl.u8   q8, d8
-+         vmovl.u8   q9, d9
-+@@ -344,8 +316,388 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+ 
-+         vstm r0, {q0-q3}
-+         add  r0, r2
-++.endm
-++
-++.macro edge_w32_body
-++        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q5, q1
-++        vcgt.u8 q1,  q1, q5
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vcgt.u8  q0,  q4, q8 // c > b
-++        vsub.s8 q13, q1, q13 // diff0 part 2
-++
-++        vcgt.u8  q6,  q8, q4 // b > c
-++        vcgt.u8  q1,  q5, q9
-++        vcgt.u8  q7,  q9, q5
-++
-++        vsub.s8 q0, q6, q0 // diff1
-++        vsub.s8 q1, q7, q1 // diff1 part 2
-++        vadd.s8 q0, q12 //diff0 + diff1
-++
-++        vdup.s8 q7, r6 // 3 to all elements
-++        sub     r6, #1
-++        vadd.s8 q1, q13
-++
-++        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-++        vclt.s8 q13, q1, #0
-++
-++        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
-++        vadd.s8  q10,  q1, q7
-++        vdup.s8 q7, r6  // 2 to all elements
-++        add     r6, #1
-++        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-++        vand.8   q13, q10, q13
-++
-++
-++        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-++        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
-++        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q1, #0
-++        vadd.s8   q0, q11, q12  // offset_idx
-++
-++        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
-++        vmov.32  d14[0], r7  // load offset table from general registers
-++        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vmov.32  d14[1], r5  // load rest of offset table
-++        vadd.s8   q1, q11, q13
-++
-++        vtbl.8   d0, {d14}, d0
-++        vtbl.8   d1, {d14}, d1
-++        vtbl.8   d2, {d14}, d2
-++        vtbl.8   d3, {d14}, d3
-++
-++        vmovl.u8   q6, d8
-++        vmovl.u8   q7, d9
-++        vmovl.u8  q10, d10
-++        vmovl.u8  q11, d11
-++
-++        vaddw.s8  q6, d0
-++        vaddw.s8  q7, d1
-++        vaddw.s8 q10, d2
-++        vaddw.s8 q11, d3
-++
-++        vqmovun.s16  d0, q6
-++        vqmovun.s16  d1, q7
-++        vqmovun.s16  d2, q10
-++        vqmovun.s16  d3, q11
-++
-++        vstm r0, {q0-q1}
-++        add  r0, r2
-++.endm
-++
-++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub    r1, #8
-++1:      subs    r4, #1
-++        vld1.64  {q10-q11}, [r1]!
-++        vld1.64  {q12-q13}, [r1]!
-++        vld1.64  {q14}, [r1], r3
-++        sub      r1, #64
-++        // load a
-++        vext.8 q0, q10, q11, #7
-++        vext.8 q1, q11, q12, #7
-++        vext.8 q2, q12, q13, #7
-++        vext.8 q3, q13, q14, #7
-++        // load c
-++        vext.8 q4, q10, q11, #8
-++        vext.8 q5, q11, q12, #8
-++        vext.8 q6, q12, q13, #8
-++        vext.8 q7, q13, q14, #8
-++        // load b
-++        vext.8 q8, q10, q11, #9
-++        vext.8 q9, q11, q12, #9
-++        vext.8 q10, q12, q13, #9
-++        vext.8 q11, q13, q14, #9
-++        edge_w64_body
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub     r1, r3
-++        // load a
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #32
-++1:      subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #32
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+         pop   {r4-r8}
-+         bx lr
-+ endfunc
-++
-++function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++1:      sub     r1, r3
-++        // load a
-++        // TODO: fix unaligned load
-++        //       don't reload a like in eo1
-++        sub     r1, #1
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #31
-++        subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        add     r1, #1
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #33
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++1:      sub     r1, r3
-++        // load a
-++        // TODO: fix unaligned load
-++        //       don't reload a like in eo1
-++        add     r1, #1
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #33
-++        subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        sub     r1, #1
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #31
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub    r1, #8 // load 8 extra bytes
-++1:      subs    r4, #1
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
-++        sub    r1, #32
-++        // a
-++        vext.8  q0, q10, q11, #7
-++        vext.8  q1, q11, q12, #7
-++        // c
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        // b
-++        vext.8  q8, q10, q11, #9
-++        vext.8  q9, q11, q12, #9
-++        edge_w32_body
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        // load a
-++        sub     r1, r3
-++        vld1.8  {q0-q1}, [r1], r3
-++        // load c
-++        vld1.8  {q4-q5}, [r1], r3
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q8-q9}, [r1], r3
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        // c
-++        vmov.64 q4, q8
-++        vmov.64 q5, q9
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        // load a
-++        sub     r1, r3
-++        sub    r1, #8
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q0, q10, q11, #7
-++        vext.8  q1, q11, q12, #7
-++        // load c
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        vext.8  q2, q10, q11, #7
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q8, q10, q11, #9
-++        vext.8  q9, q11, q12, #9
-++        vext.8  q14, q10, q11, #8
-++        vext.8  q15, q11, q12, #8
-++        vext.8  q3, q10, q11, #7
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vmov.8 q0, q2
-++        vext.8 q1, q4, q5, #15
-++        // c
-++        vmov.8  q4, q14
-++        vmov.8  q5, q15
-++        vmov.8  q2, q3
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        sub    r1, r3
-++        ldr    r5, [r5]
-++        sub    r1, #8
-++        vpush {d8-d15}
-++        // load a
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q0, q10, q11, #9
-++        vext.8  q1, q11, q12, #9
-++        // load c
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        vext.8  q2, q12, q11, #8
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q8, q10, q11, #7
-++        vext.8  q9, q11, q12, #7
-++        vext.8  q3, q12, q10, #7
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vext.8 q0, q4, q5, #1
-++        vext.8 q1, q5, q2, #1
-++        // c
-++        vext.8  q4, q8, q9, #1
-++        vext.8  q5, q9, q3, #1
-++        vext.8  q2, q3, q1, #1
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-+-- 
-+2.5.0
-+
-+
-+From 1898d052a73370166d57e17cc7c52b7275887df3 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Fri, 19 Dec 2014 09:44:10 +0200
-+Subject: [PATCH 4/9] Improved SAO band NEON opimizations made SAO buffer 16
-+ byte aligned added alignment hints to loads and stores optimized register
-+ usage in SAO band neon assembly
-+
-+---
-+ libavcodec/arm/hevcdsp_sao_neon.S | 212 +++++++++++++++-----------------------
-+ 1 file changed, 82 insertions(+), 130 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 4687012..ac21013 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -22,120 +22,84 @@
-+ #include "neon.S"
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-+-1:      subs    r4, #1
-+-        vld1.8   {d24}, [r1], r3
-++1:      subs     r12, #1
-++        vld1.8   {d24}, [r1,:64], r3
-+         vshr.u8  d16, d24, #3
-+         vtbl.8   d16, {q0, q1}, d16
-+-        vmovl.s8 q2, d16
-+         vmovl.u8 q6, d24
-+-        vadd.s16 q2, q6
-++        vaddw.s8 q6, d16
-+         vqmovun.s16 d4, q2
-+-        vst1.8  {d4}, [r0], r2
-++        vst1.8  {d4}, [r0,:64], r2
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-+-
-+-1:      subs    r4, #1
-+-        vld1.8  {q12}, [r1], r3
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-++1:      subs     r12, #1
-++        vld1.8  {q12}, [r1,:128], r3
-+         vshr.u8   q8, q12, #3
-+-
-+         vtbl.8  d16, {q0, q1}, d16
-+         vtbl.8  d17, {q0, q1}, d17
-+-
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17
-+-
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-
-+-        vstm.8   r0, {q2}
-+-        add    r0, r2
-++        vmovl.u8 q10, d24
-++        vmovl.u8 q11, d25
-++        vaddw.s8 q10, d16
-++        vaddw.s8 q11, d17
-++        vqmovun.s16 d4, q10
-++        vqmovun.s16 d5, q11
-++        vst1.8   {q2}, [r0,:128], r2
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-+-
-+-1:      subs    r4, #1
-+-        vld1.8  {q12-q13}, [r1], r3
-+-
-+-        vshr.u8   q8, q12, #3
-+-        vshr.u8   q9, q13, #3
-+-
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vtbl.8  d18, {q0, q1}, d18
-+-        vtbl.8  d19, {q0, q1}, d19
-+-
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17 // q8 free
-+-        vmovl.s8 q4, d18
-+-        vmovl.s8 q5, d19 // q9 free
-+-
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25 // q12 free
-+-        vmovl.u8 q8, d26
-+-        vmovl.u8 q9, d27 // q13 free
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q4 // q4 free
-+-        vqmovun.s16 d7, q5 // q5 free
-+-
-+-        vst1.8 {q2-q3}, [r0], r2
-+-        bne    1b
-+-
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-++
-++1:      subs     r12, #1
-++        vld1.8   {q2-q3}, [r1,:128], r3
-++        vshr.u8  q8, q2, #3
-++        vshr.u8  q9, q3, #3
-++        vtbl.8   d16, {q0, q1}, d16
-++        vtbl.8   d17, {q0, q1}, d17
-++        vtbl.8   d18, {q0, q1}, d18
-++        vtbl.8   d19, {q0, q1}, d19
-++        vmovl.u8 q12, d4
-++        vmovl.u8 q13, d5
-++        vmovl.u8 q14, d6
-++        vmovl.u8 q15, d7
-++        vaddw.s8 q12, d16
-++        vaddw.s8 q13, d17
-++        vaddw.s8 q14, d18
-++        vaddw.s8 q15, d19
-++        vqmovun.s16 d4, q12
-++        vqmovun.s16 d5, q13
-++        vqmovun.s16 d6, q14
-++        vqmovun.s16 d7, q15
-++        vst1.8   {q2-q3}, [r0,:128], r2
-++        bne      1b
-++
-++        bx       lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-+-1:      subs    r4, #1
-+-        vld1.8  {q12-q13}, [r1]!
-+-        vld1.8  {q14-q15}, [r1], r3
-++1:      subs     r12, #1
-++        vld1.8  {q12-q13}, [r1,:128]!
-++        vld1.8  {q14-q15}, [r1,:128], r3
-+         sub     r1, #32
-+ 
-+         vshr.u8   q8, q12, #3
-+@@ -152,53 +116,41 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         vtbl.8  d22, {q0, q1}, d22
-+         vtbl.8  d23, {q0, q1}, d23
-+ 
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17 // q8 free
-+-        vmovl.s8 q4, d18
-+-        vmovl.s8 q5, d19 // q9 free
-++        vmovl.u8 q2, d24
-++        vmovl.u8 q3, d25
-++        vmovl.u8 q12, d26
-++        vmovl.u8 q13, d27
-+ 
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25 // q12 free
-+-        vmovl.u8 q8, d26
-+-        vmovl.u8 q9, d27 // q13 free
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-++        vaddw.s8 q2, d16
-++        vaddw.s8 q3, d17
-++        vaddw.s8 q12, d18
-++        vaddw.s8 q13, d19
-+ 
-+         vqmovun.s16 d4, q2
-+         vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q4 // q4 free
-+-        vqmovun.s16 d7, q5 // q5 free
-+-
-+-        // free q4 -q9, q12 - q13
-+-        vmovl.s8 q4, d20
-+-        vmovl.s8 q5, d21 // q10 free
-+-        vmovl.s8 q6, d22
-+-        vmovl.s8 q7, d23 // q11 free
-+-
-+-        vmovl.u8  q8, d28
-+-        vmovl.u8  q9, d29 // q14 free
-+-        vmovl.u8 q10, d30
-+-        vmovl.u8 q11, d31 // q15 free
-+-
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-+-        vadd.s16 q6, q10
-+-        vadd.s16 q7, q11
-+-
-+-        vqmovun.s16  d8, q4
-+-        vqmovun.s16  d9, q5
-+-        vqmovun.s16 d10, q6
-+-        vqmovun.s16 d11, q7
-+-
-+-        vstm.8   r0, {q2-q5}
-+-        add    r0, r2
-++        vqmovun.s16 d6, q12
-++        vqmovun.s16 d7, q13
-++
-++        vmovl.u8 q12, d28
-++        vmovl.u8 q13, d29
-++        vmovl.u8 q14, d30
-++        vmovl.u8 q15, d31
-++
-++        vaddw.s8 q12, d20
-++        vaddw.s8 q13, d21
-++        vaddw.s8 q14, d22
-++        vaddw.s8 q15, d23
-++
-++        vqmovun.s16  d8, q12
-++        vqmovun.s16  d9, q13
-++        vqmovun.s16 d10, q14
-++        vqmovun.s16 d11, q15
-++
-++        vst1.8     {q2-q3}, [r0,:128]!
-++        vst1.8     {q4-q5}, [r0,:128], r2
-++        sub    r0, #32
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 26bd536800db2f50ff6a021e1fda0d0394d1ea01 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 29 Dec 2014 15:00:49 +0200
-+Subject: [PATCH 5/9] better code reuse in NEON SAO band
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  16 ++--
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 155 +++++++++++++------------------------
-+ 2 files changed, 61 insertions(+), 110 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index c32940e..6379810 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-+-void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+ 
-+ void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+@@ -185,16 +185,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+ 
-+     switch(width){
-+     case 8:
-+-        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 16:
-+-        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 32:
-+-        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 64:
-+-        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     default:
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index ac21013..8852550 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -21,53 +21,13 @@
-+ #include "libavutil/arm/asm.S"
-+ #include "neon.S"
-+ 
-+-function ff_hevc_sao_band_w8_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-++.macro init_sao_band
-++        ldr      r12, [sp, #0]    // offset_table address
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8   {d24}, [r1,:64], r3
-+-        vshr.u8  d16, d24, #3
-+-        vtbl.8   d16, {q0, q1}, d16
-+-        vmovl.u8 q6, d24
-+-        vaddw.s8 q6, d16
-+-        vqmovun.s16 d4, q2
-+-        vst1.8  {d4}, [r0,:64], r2
-+-        bne    1b
-+-
-+-        bx lr
-+-endfunc
-+-
-+-function ff_hevc_sao_band_w16_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8  {q12}, [r1,:128], r3
-+-        vshr.u8   q8, q12, #3
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vmovl.u8 q10, d24
-+-        vmovl.u8 q11, d25
-+-        vaddw.s8 q10, d16
-+-        vaddw.s8 q11, d17
-+-        vqmovun.s16 d4, q10
-+-        vqmovun.s16 d5, q11
-+-        vst1.8   {q2}, [r0,:128], r2
-+-        bne    1b
-+-
-+-        bx lr
-+-endfunc
-+-
-+-function ff_hevc_sao_band_w32_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-++        ldr      r12, [sp, #4]    // height
-++.endm
-+ 
-+-1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1,:128], r3
-++.macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-+         vtbl.8   d16, {q0, q1}, d16
-+@@ -86,6 +46,43 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+         vqmovun.s16 d5, q13
-+         vqmovun.s16 d6, q14
-+         vqmovun.s16 d7, q15
-++.endm
-++
-++function ff_hevc_sao_band_w8_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #4
-++        vld1.8   {d4}, [r1,:64], r3
-++        vld1.8   {d5}, [r1,:64], r3
-++        vld1.8   {d6}, [r1,:64], r3
-++        vld1.8   {d7}, [r1,:64], r3
-++        sao_band_32
-++        vst1.8  {d4}, [r0,:64], r2
-++        vst1.8  {d5}, [r0,:64], r2
-++        vst1.8  {d6}, [r0,:64], r2
-++        vst1.8  {d7}, [r0,:64], r2
-++        bne    1b
-++
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w16_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #2
-++        vld1.8  {q2}, [r1,:128], r3
-++        vld1.8  {q3}, [r1,:128], r3
-++        sao_band_32
-++        vst1.8   {q2}, [r0,:128], r2
-++        vst1.8   {q3}, [r0,:128], r2
-++        bne    1b
-++
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w32_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #1
-++        vld1.8   {q2-q3}, [r1,:128], r3
-++        sao_band_32
-+         vst1.8   {q2-q3}, [r0,:128], r2
-+         bne      1b
-+ 
-+@@ -93,63 +90,17 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8  {q12-q13}, [r1,:128]!
-+-        vld1.8  {q14-q15}, [r1,:128], r3
-+-        sub     r1, #32
-+-
-+-        vshr.u8   q8, q12, #3
-+-        vshr.u8   q9, q13, #3
-+-        vshr.u8  q10, q14, #3
-+-        vshr.u8  q11, q15, #3
-+-
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vtbl.8  d18, {q0, q1}, d18
-+-        vtbl.8  d19, {q0, q1}, d19
-+-        vtbl.8  d20, {q0, q1}, d20
-+-        vtbl.8  d21, {q0, q1}, d21
-+-        vtbl.8  d22, {q0, q1}, d22
-+-        vtbl.8  d23, {q0, q1}, d23
-+-
-+-        vmovl.u8 q2, d24
-+-        vmovl.u8 q3, d25
-+-        vmovl.u8 q12, d26
-+-        vmovl.u8 q13, d27
-+-
-+-        vaddw.s8 q2, d16
-+-        vaddw.s8 q3, d17
-+-        vaddw.s8 q12, d18
-+-        vaddw.s8 q13, d19
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q12
-+-        vqmovun.s16 d7, q13
-+-
-+-        vmovl.u8 q12, d28
-+-        vmovl.u8 q13, d29
-+-        vmovl.u8 q14, d30
-+-        vmovl.u8 q15, d31
-+-
-+-        vaddw.s8 q12, d20
-+-        vaddw.s8 q13, d21
-+-        vaddw.s8 q14, d22
-+-        vaddw.s8 q15, d23
-+-
-+-        vqmovun.s16  d8, q12
-+-        vqmovun.s16  d9, q13
-+-        vqmovun.s16 d10, q14
-+-        vqmovun.s16 d11, q15
-+-
-+-        vst1.8     {q2-q3}, [r0,:128]!
-+-        vst1.8     {q4-q5}, [r0,:128], r2
-+-        sub    r0, #32
-+-        bne    1b
-++        init_sao_band
-++1:      subs      r12, #1
-++        vld1.8    {q2-q3}, [r1,:128]!
-++        sao_band_32
-++        vst1.8    {q2-q3}, [r0,:128]!
-++        vld1.8    {q2-q3}, [r1,:128], r3
-++        sub       r1, #32
-++        sao_band_32
-++        vst1.8    {q2-q3}, [r0,:128], r2
-++        sub       r0, #32
-++        bne       1b
-+ 
-+         bx lr
-+ endfunc
-+-- 
-+2.5.0
-+
-+
-+From f93646a97bc885b81759e774d04be3781916a3e7 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Wed, 7 Jan 2015 15:27:38 +0200
-+Subject: [PATCH 6/9] More SAO NEON optimizations Now uses only 8 bit integers
-+ for SAO calculations
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |   7 +-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 664 +++++++++++++++----------------------
-+ 2 files changed, 272 insertions(+), 399 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 6379810..8d6e863 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -225,7 +225,7 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     int x, y;
-+ 
-+     for (x = 0; x < 5; x++) {
-+-        sao_offset_val[x] = _sao_offset_val[x];
-++        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-+     }
-+ 
-+     stride_src /= sizeof(pixel);
-+@@ -271,8 +271,9 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+             for (x = 0; x < width; x++) {
-+                 int diff0         = CMP(src[x], src[x + a_stride]);
-+                 int diff1         = CMP(src[x], src[x + b_stride]);
-+-                int offset_val    = edge_idx[2 + diff0 + diff1];
-+-                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-++                int idx           = diff0 + diff1;
-++                if (idx)
-++                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
-+             }
-+             src += stride_src;
-+             dst += stride_dst;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 8852550..5fc482b 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -1,5 +1,5 @@
-+ /*
-+- * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+  *
-+  * This file is part of FFmpeg.
-+  *
-+@@ -23,6 +23,7 @@
-+ 
-+ .macro init_sao_band
-+         ldr      r12, [sp, #0]    // offset_table address
-++        pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-+ .endm
-+@@ -30,36 +31,31 @@
-+ .macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-++        vmov.u8  q14, #128
-+         vtbl.8   d16, {q0, q1}, d16
-+         vtbl.8   d17, {q0, q1}, d17
-+         vtbl.8   d18, {q0, q1}, d18
-+         vtbl.8   d19, {q0, q1}, d19
-+-        vmovl.u8 q12, d4
-+-        vmovl.u8 q13, d5
-+-        vmovl.u8 q14, d6
-+-        vmovl.u8 q15, d7
-+-        vaddw.s8 q12, d16
-+-        vaddw.s8 q13, d17
-+-        vaddw.s8 q14, d18
-+-        vaddw.s8 q15, d19
-+-        vqmovun.s16 d4, q12
-+-        vqmovun.s16 d5, q13
-+-        vqmovun.s16 d6, q14
-+-        vqmovun.s16 d7, q15
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q14
-++        vqadd.s8 q2, q8
-++        vqadd.s8 q3, q9
-++        vsub.s8  q2, q14
-++        vsub.s8  q3, q14
-+ .endm
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #4
-+-        vld1.8   {d4}, [r1,:64], r3
-+-        vld1.8   {d5}, [r1,:64], r3
-+-        vld1.8   {d6}, [r1,:64], r3
-+-        vld1.8   {d7}, [r1,:64], r3
-++        vld1.8   {d4}, [r1, :64], r3
-++        vld1.8   {d5}, [r1, :64], r3
-++        vld1.8   {d6}, [r1, :64], r3
-++        vld1.8   {d7}, [r1, :64], r3
-+         sao_band_32
-+-        vst1.8  {d4}, [r0,:64], r2
-+-        vst1.8  {d5}, [r0,:64], r2
-+-        vst1.8  {d6}, [r0,:64], r2
-+-        vst1.8  {d7}, [r0,:64], r2
-++        vst1.8  {d4}, [r0, :64], r2
-++        vst1.8  {d5}, [r0, :64], r2
-++        vst1.8  {d6}, [r0, :64], r2
-++        vst1.8  {d7}, [r0, :64], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -68,11 +64,11 @@ endfunc
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #2
-+-        vld1.8  {q2}, [r1,:128], r3
-+-        vld1.8  {q3}, [r1,:128], r3
-++        vld1.8  {q2}, [r1, :128], r3
-++        vld1.8  {q3}, [r1, :128], r3
-+         sao_band_32
-+-        vst1.8   {q2}, [r0,:128], r2
-+-        vst1.8   {q3}, [r0,:128], r2
-++        vst1.8   {q2}, [r0, :128], r2
-++        vst1.8   {q3}, [r0, :128], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -81,9 +77,9 @@ endfunc
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1,:128], r3
-++        vld1.8   {q2-q3}, [r1, :128], r3
-+         sao_band_32
-+-        vst1.8   {q2-q3}, [r0,:128], r2
-++        vst1.8   {q2-q3}, [r0, :128], r2
-+         bne      1b
-+ 
-+         bx       lr
-+@@ -92,263 +88,153 @@ endfunc
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+         init_sao_band
-+ 1:      subs      r12, #1
-+-        vld1.8    {q2-q3}, [r1,:128]!
-++        pld       [r1, r3]
-++        vld1.8    {q2-q3}, [r1, :128]!
-+         sao_band_32
-+-        vst1.8    {q2-q3}, [r0,:128]!
-+-        vld1.8    {q2-q3}, [r1,:128], r3
-++        vst1.8    {q2-q3}, [r0, :128]!
-++        vld1.8    {q2-q3}, [r1, :128], r3
-+         sub       r1, #32
-+         sao_band_32
-+-        vst1.8    {q2-q3}, [r0,:128], r2
-++        vst1.8    {q2-q3}, [r0, :128], r2
-+         sub       r0, #32
-+         bne       1b
-+ 
-+         bx lr
-+ endfunc
-+-
-++// input
-++// a in q0 - q3
-++// c in q4 - q7
-++// b in q8 - q11
-++// offset table in r7 and r5
-++// output in q0 - q3
-++// clobbers q12 - q15
-+ .macro edge_w64_body
-+-        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q5, q1
-+-        vcgt.u8 q1,  q1, q5
-+-        vcgt.u8 q14, q6, q2
-+-        vcgt.u8 q2,  q2, q6
-+-        vcgt.u8 q15, q7, q3
-+-        vcgt.u8 q3,  q3, q7
-+-
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vsub.s8 q13, q1, q13
-+-        vsub.s8 q14, q2, q14
-+-        vsub.s8 q15, q3, q15
-+-
-++        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13,  q5, q1
-++        vcgt.u8  q1,  q1, q5
-++        vsub.s8 q12,  q0, q12 // diff0
-+         vcgt.u8  q0,  q4, q8 // c > b
-+-        vcgt.u8  q8,  q8, q4 // b > c
-++        vsub.s8 q13,  q1, q13
-++
-++        vcgt.u8 q14,  q8, q4 // b > c
-+         vcgt.u8  q1,  q5, q9
-+-        vcgt.u8  q9,  q9, q5
-+-        vcgt.u8  q2,  q6, q10
-+-        vcgt.u8 q10, q10, q6
-+-        vcgt.u8  q3,  q7, q11
-+-        vcgt.u8 q11, q11, q7
-++        vcgt.u8 q15,  q9, q5
-++        vsub.s8  q0, q14, q0 // diff1
-+ 
-+-        vsub.s8 q0, q8, q0 // diff1
-+-        vsub.s8 q1, q9, q1
-+-        vsub.s8 q2, q10, q2
-+-        vsub.s8 q3, q11, q3
-++        vsub.s8  q1, q15, q1
-+ 
-+-        vadd.s8 q0, q12 //diff0 + diff1
-+-        vadd.s8 q1, q13
-+-        vadd.s8 q2, q14
-+-        vadd.s8 q3, q15
-+-
-+-        vdup.s8 q9, r6 // 3 to all elements
-+-        sub     r6, #1
-+-
-+-        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+-        vclt.s8 q13, q1, #0
-+-        vclt.s8 q14, q2, #0
-+-        vclt.s8 q15, q3, #0
-+-
-+-        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
-+-        vadd.s8  q10,  q1, q9
-+-        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+-        vand.8   q13, q10, q13
-+-        vadd.s8  q8,  q2, q9
-+-        vadd.s8  q10,  q3, q9
-+-        vand.8   q14, q8, q14
-+-        vand.8   q15, q10, q15
-+-
-+-        vdup.s8 q9, r6  // 2 to all elements
-+-        add     r6, #1
-+-
-+-        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+-        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q1, #0
-+-        vadd.s8   q0, q11, q12  // offset_idx
-+-
-+-        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
-+-        vcgt.s8  q12, q2, #0
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
-+-        vadd.s8   q1, q11, q13
-+-
-+-        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q3, #0
-+-        vadd.s8   q2, q11, q14
-+-
-+-        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
-+-        vmov.32  d18[0], r7  // load offset table from general registers
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vmov.32  d18[1], r5  // load rest of offset table
-+-        vadd.s8   q3, q11, q15
-+-
-+-        vtbl.8   d0, {d18}, d0
-+-        vtbl.8   d1, {d18}, d1
-+-        vtbl.8   d2, {d18}, d2
-+-        vtbl.8   d3, {d18}, d3
-+-        vtbl.8   d4, {d18}, d4
-+-        vtbl.8   d5, {d18}, d5
-+-        vtbl.8   d6, {d18}, d6
-+-        vtbl.8   d7, {d18}, d7
-+-
-+-        vmovl.u8   q8, d8
-+-        vmovl.u8   q9, d9
-+-        vmovl.u8  q10, d10
-+-        vmovl.u8  q11, d11
-+-        vmovl.u8  q12, d12
-+-        vmovl.u8  q13, d13
-+-        vmovl.u8  q14, d14
-+-        vmovl.u8  q15, d15
-+-
-+-        vaddw.s8  q8, d0
-+-        vaddw.s8  q9, d1
-+-        vaddw.s8 q10, d2
-+-        vaddw.s8 q11, d3
-+-        vaddw.s8 q12, d4
-+-        vaddw.s8 q13, d5
-+-        vaddw.s8 q14, d6
-+-        vaddw.s8 q15, d7
-+-
-+-        vqmovun.s16  d0, q8
-+-        vqmovun.s16  d1, q9
-+-        vqmovun.s16  d2, q10
-+-        vqmovun.s16  d3, q11
-+-        vqmovun.s16  d4, q12
-+-        vqmovun.s16  d5, q13
-+-        vqmovun.s16  d6, q14
-+-        vqmovun.s16  d7, q15
-+-
-+-        vstm r0, {q0-q3}
-+-        add  r0, r2
-+-.endm
-++        vadd.s8  q0, q12 //diff0 + diff1
-++        vadd.s8  q1, q13
-+ 
-+-.macro edge_w32_body
-+-        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q5, q1
-+-        vcgt.u8 q1,  q1, q5
-++        vcgt.u8 q14,  q6, q2
-++        vcgt.u8  q2,  q2, q6
-++        vcgt.u8 q15,  q7, q3
-++        vcgt.u8  q3,  q3, q7
-+ 
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vcgt.u8  q0,  q4, q8 // c > b
-+-        vsub.s8 q13, q1, q13 // diff0 part 2
-++        vsub.s8 q14,  q2, q14
-++        vcgt.u8  q2,  q6, q10
-++        vsub.s8 q15,  q3, q15
-+ 
-+-        vcgt.u8  q6,  q8, q4 // b > c
-+-        vcgt.u8  q1,  q5, q9
-+-        vcgt.u8  q7,  q9, q5
-++        vcgt.u8 q12, q10, q6
-++        vcgt.u8  q3,  q7, q11
-++        vcgt.u8 q13, q11, q7
-++        vsub.s8  q2, q12, q2
-++        vsub.s8  q3, q13, q3
-+ 
-+-        vsub.s8 q0, q6, q0 // diff1
-+-        vsub.s8 q1, q7, q1 // diff1 part 2
-+-        vadd.s8 q0, q12 //diff0 + diff1
-++        vmov.s8 q13, #2 // 2 to all elements
-+ 
-+-        vdup.s8 q7, r6 // 3 to all elements
-+-        sub     r6, #1
-+-        vadd.s8 q1, q13
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q15
-++
-++        vmov.32  d24[0], r4  // load offset table from general registers
-++        vmov.32  d24[1], r5  // load rest of offset table
-+ 
-+-        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+-        vclt.s8 q13, q1, #0
-+-
-+-        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
-+-        vadd.s8  q10,  q1, q7
-+-        vdup.s8 q7, r6  // 2 to all elements
-+-        add     r6, #1
-+-        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+-        vand.8   q13, q10, q13
-+-
-+-
-+-        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+-        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
-+-        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q1, #0
-+-        vadd.s8   q0, q11, q12  // offset_idx
-+-
-+-        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
-+-        vmov.32  d14[0], r7  // load offset table from general registers
-+-        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vmov.32  d14[1], r5  // load rest of offset table
-+-        vadd.s8   q1, q11, q13
-+-
-+-        vtbl.8   d0, {d14}, d0
-+-        vtbl.8   d1, {d14}, d1
-+-        vtbl.8   d2, {d14}, d2
-+-        vtbl.8   d3, {d14}, d3
-+-
-+-        vmovl.u8   q6, d8
-+-        vmovl.u8   q7, d9
-+-        vmovl.u8  q10, d10
-+-        vmovl.u8  q11, d11
-+-
-+-        vaddw.s8  q6, d0
-+-        vaddw.s8  q7, d1
-+-        vaddw.s8 q10, d2
-+-        vaddw.s8 q11, d3
-+-
-+-        vqmovun.s16  d0, q6
-+-        vqmovun.s16  d1, q7
-+-        vqmovun.s16  d2, q10
-+-        vqmovun.s16  d3, q11
-+-
-+-        vstm r0, {q0-q1}
-+-        add  r0, r2
-++        vadd.s8 q0, q13
-++        vadd.s8 q1, q13
-++        vadd.s8 q2, q13
-++        vadd.s8 q3, q13
-++
-++        vmov.u8  q15, #128 // s8 #-128
-++        vtbl.8   d0, {d24}, d0
-++        vtbl.8   d1, {d24}, d1
-++        vtbl.8   d2, {d24}, d2
-++        vtbl.8   d3, {d24}, d3
-++        vtbl.8   d4, {d24}, d4
-++        vtbl.8   d5, {d24}, d5
-++        vtbl.8   d6, {d24}, d6
-++        vtbl.8   d7, {d24}, d7
-++
-++        vadd.s8  q12,  q4, q15
-++        vadd.s8  q13,  q5, q15
-++        vadd.s8  q14,  q6, q15
-++        vadd.s8  q15,  q7, q15
-++        vqadd.s8 q12,  q0
-++        vqadd.s8 q15,  q3
-++        vmov.u8   q3, #128 // s8 #-128
-++        vqadd.s8 q13,  q1
-++        vqadd.s8 q14,  q2
-++        vsub.s8   q0, q12, q3
-++        vsub.s8   q1, q13, q3
-++        vsub.s8   q2, q14, q3
-++        vsub.s8   q3, q15, q3
-++        vst1.8  {q0-q1}, [r0, :128]!
-++        vst1.8  {q2-q3}, [r0, :128], r2
-++        sub     r0, #32
-+ .endm
-+ 
-+-function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-++.macro init_edge_64
-++        push   {r4-r5}
-++        ldr    r12, [sp, #8] // height
-++        ldr    r5, [sp, #12] // sao_offset_val_table
-++        ldr    r4, [r5]
-+         add    r5, #4
-+         ldr    r5, [r5]
-++.endm
-++
-++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-++        init_edge_64
-+         vpush {d8-d15}
-+         sub    r1, #8
-+-1:      subs    r4, #1
-+-        vld1.64  {q10-q11}, [r1]!
-+-        vld1.64  {q12-q13}, [r1]!
-+-        vld1.64  {q14}, [r1], r3
-+-        sub      r1, #64
-++1:      subs    r12, #1
-++        vld1.64  {d7}, [r1, :64]!
-++        vld1.64  {q4-q5}, [r1, :128]! // load c
-++        vld1.64  {q6-q7}, [r1, :128]!
-++        vld1.64  {d24}, [r1, :64], r3
-++        sub      r1, #72
-+         // load a
-+-        vext.8 q0, q10, q11, #7
-+-        vext.8 q1, q11, q12, #7
-+-        vext.8 q2, q12, q13, #7
-+-        vext.8 q3, q13, q14, #7
-+-        // load c
-+-        vext.8 q4, q10, q11, #8
-+-        vext.8 q5, q11, q12, #8
-+-        vext.8 q6, q12, q13, #8
-+-        vext.8 q7, q13, q14, #8
-++        vext.8 q0, q3, q4, #15
-++        vext.8 q1, q4, q5, #15
-++        vext.8 q2, q5, q6, #15
-++        vext.8 q3, q6, q7, #15
-+         // load b
-+-        vext.8 q8, q10, q11, #9
-+-        vext.8 q9, q11, q12, #9
-+-        vext.8 q10, q12, q13, #9
-+-        vext.8 q11, q13, q14, #9
-++        vext.8 q8, q4, q5, #1
-++        vext.8 q9, q5, q6, #1
-++        vext.8 q10, q6, q7, #1
-++        vext.8 q11, q7, q12, #1
-+         edge_w64_body
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+         sub     r1, r3
-+         // load a
-+-        vld1.8  {q0-q1}, [r1]!
-+-        vld1.8  {q2-q3}, [r1], r3
-++        vld1.8  {q0-q1}, [r1, :128]!
-++        vld1.8  {q2-q3}, [r1, :128], r3
-+         sub     r1, #32
-+-1:      subs    r4, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q8-q9}, [r1]!
-+-        vld1.8  {q10-q11}, [r1]
-++        vld1.8  {q8-q9}, [r1, :128]!
-++        vld1.8  {q10-q11}, [r1, :128], r3
-+         sub     r1, #32
-+         edge_w64_body
-+         // copy c to a
-+@@ -356,20 +242,19 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+         vmov.64 q1, q5
-+         vmov.64 q2, q6
-+         vmov.64 q3, q7
-++        // copy b to c
-++        vmov.64 q4, q8
-++        vmov.64 q5, q9
-++        vmov.64 q6, q10
-++        vmov.64 q7, q11
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+ 1:      sub     r1, r3
-+         // load a
-+@@ -379,10 +264,10 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+         vld1.8  {q0-q1}, [r1]!
-+         vld1.8  {q2-q3}, [r1], r3
-+         sub     r1, #31
-+-        subs    r4, #1
-++        subs    r12, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-+         // load b
-+         add     r1, #1
-+@@ -390,25 +275,14 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+         vld1.8  {q10-q11}, [r1]
-+         sub     r1, #33
-+         edge_w64_body
-+-        // copy c to a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-+-        vmov.64 q2, q6
-+-        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+ 1:      sub     r1, r3
-+         // load a
-+@@ -418,10 +292,10 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         vld1.8  {q0-q1}, [r1]!
-+         vld1.8  {q2-q3}, [r1], r3
-+         sub     r1, #33
-+-        subs    r4, #1
-++        subs    r12, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-+         // load b
-+         sub     r1, #1
-+@@ -429,178 +303,176 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         vld1.8  {q10-q11}, [r1]
-+         sub     r1, #31
-+         edge_w64_body
-+-        // copy c to a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-+-        vmov.64 q2, q6
-+-        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-++// inputs:
-++// a in q0, q1
-++// c in q2, q3
-++// b in q8, q9
-++// offset table in d31
-++// clobbered registers q0, q1, q10, q11, q12, q13
-++// output q0, q1
-++.macro edge_w32_body
-++        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q3, q1
-++        vcgt.u8 q1,  q1, q3
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vcgt.u8  q0,  q2, q8 // c > b
-++        vsub.s8 q13, q1, q13 // diff0 part 2
-++
-++        vcgt.u8  q10,  q8, q2 // b > c
-++        vcgt.u8  q1,  q3, q9
-++        vcgt.u8  q11,  q9, q3
-++
-++        vsub.s8 q0, q10, q0 // diff1
-++
-++        vmov.s8 q10, #2 // 2 to all elements
-++        vsub.s8 q1, q11, q1 // diff1 part 2
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++
-++        vadd.s8 q0, q10
-++        vadd.s8 q1, q10
-++
-++        vmov.u8  q10, #128
-++        vtbl.8   d0, {d31}, d0
-++        vtbl.8   d1, {d31}, d1
-++        vtbl.8   d2, {d31}, d2
-++        vtbl.8   d3, {d31}, d3
-++
-++        vadd.s8    q11, q2, q10
-++        vadd.s8    q12, q3, q10
-++        vqadd.s8   q11, q0
-++        vqadd.s8   q12, q1
-++        vsub.s8    q0, q11, q10
-++        vsub.s8    q1, q12, q10
-++        vst1.8   {q0-q1}, [r0, :128], r2
-++.endm
-++
-++.macro init_edge_32
-++        ldr     r12, [sp, #4] // sao_offset_val_table
-++        vld1.32 {d31}, [r12]
-++        ldr     r12, [sp] // height
-++.endm
-++
-+ function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-+-        sub    r1, #8 // load 8 extra bytes
-+-1:      subs    r4, #1
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
-+-        sub    r1, #32
-++        init_edge_32
-++        sub     r1, #4 // load 4 extra bytes
-++1:      subs    r12, #1
-++        vld1.32 d3[1], [r1]!
-++        vld1.8  {q2-q3}, [r1, :128]! // c
-++        vld1.32 d20[0], [r1], r3
-++        sub     r1, #36
-+         // a
-+-        vext.8  q0, q10, q11, #7
-+-        vext.8  q1, q11, q12, #7
-+-        // c
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-++        vext.8  q0, q1, q2, #15
-++        vext.8  q1, q2, q3, #15
-+         // b
-+-        vext.8  q8, q10, q11, #9
-+-        vext.8  q9, q11, q12, #9
-++        vext.8  q8, q2, q3, #1
-++        vext.8  q9, q3, q10, #1
-+         edge_w32_body
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-++        init_edge_32
-+         // load a
-+         sub     r1, r3
-+-        vld1.8  {q0-q1}, [r1], r3
-++        vld1.8  {q0-q1}, [r1, :128], r3
-+         // load c
-+-        vld1.8  {q4-q5}, [r1], r3
-+-1:      subs    r4, #1
-++        vld1.8  {q2-q3}, [r1, :128], r3
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q8-q9}, [r1], r3
-++        vld1.8  {q8-q9}, [r1, :128], r3
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-++        vmov.64 q0, q2
-++        vmov.64 q1, q3
-+         // c
-+-        vmov.64 q4, q8
-+-        vmov.64 q5, q9
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vmov.64 q2, q8
-++        vmov.64 q3, q9
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-++        init_edge_32
-++        vpush   {d8-d15}
-+         // load a
-+         sub     r1, r3
-+-        sub    r1, #8
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        sub     r1, #8
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {d24}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q0, q10, q11, #7
-+         vext.8  q1, q11, q12, #7
-+         // load c
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-+-        vext.8  q2, q10, q11, #7
-+-1:      subs    r4, #1
-++        vld1.8  {d9}, [r1, :64]!
-++        vld1.8  {q2-q3}, [r1, :64], r3
-++        sub     r1, #8
-++        vext.8  q4, q4, q2, #15
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {q12}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q8, q10, q11, #9
-+         vext.8  q9, q11, q12, #9
-+-        vext.8  q14, q10, q11, #8
-+-        vext.8  q15, q11, q12, #8
-+-        vext.8  q3, q10, q11, #7
-++        vext.8  q6, q10, q11, #8
-++        vext.8  q7, q11, q12, #8
-++        vext.8  q5, q10, q11, #7
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vmov.8 q0, q2
-+-        vext.8 q1, q4, q5, #15
-++        vmov.8  q0, q4
-++        vext.8  q1, q2, q3, #15
-+         // c
-+-        vmov.8  q4, q14
-+-        vmov.8  q5, q15
-+-        vmov.8  q2, q3
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vmov.8  q2, q6
-++        vmov.8  q3, q7
-++        vmov.8  q4, q5
-++        bne     1b
-++        vpop    {d8-d15}
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        sub    r1, r3
-+-        ldr    r5, [r5]
-+-        sub    r1, #8
-+-        vpush {d8-d15}
-++        init_edge_32
-++        sub     r1, r3
-+         // load a
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q0, q10, q11, #9
-+-        vext.8  q1, q11, q12, #9
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {d24}, [r1, :64], r3
-++        sub     r1, #32
-++        vext.8  q0, q10, q11, #1
-++        vext.8  q1, q11, q12, #1
-+         // load c
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-+-        vext.8  q2, q12, q11, #8
-+-1:      subs    r4, #1
-++        vld1.8  {q2-q3}, [r1, :64]!
-++        vld1.8  {d30}, [r1, :64], r3
-++        sub     r1, #40
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {q12}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q8, q10, q11, #7
-+         vext.8  q9, q11, q12, #7
-+-        vext.8  q3, q12, q10, #7
-++        vext.8  q14, q12, q10, #7
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vext.8 q0, q4, q5, #1
-+-        vext.8 q1, q5, q2, #1
-++        vext.8  q0, q2, q3, #1
-++        vext.8  q1, q3, q15, #1
-+         // c
-+-        vext.8  q4, q8, q9, #1
-+-        vext.8  q5, q9, q3, #1
-+-        vext.8  q2, q3, q1, #1
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vext.8  q2, q8, q9, #1
-++        vext.8  q3, q9, q14, #1
-++        vext.8  d30, d28, d2, #1
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 016c39d46b86830204a4519590332d2a38f7ee51 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Thu, 8 Jan 2015 09:58:55 +0200
-+Subject: [PATCH 7/9] small optimization to SAO BAND. correct path for
-+ bit_depth_template.c
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c | 2 +-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 2 +-
-+ 2 files changed, 2 insertions(+), 2 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 8d6e863..385c35d 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -23,7 +23,7 @@
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-+ #include "libavcodec/avcodec.h"
-+-#include "../bit_depth_template.c"
-++#include "libavcodec/bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 5fc482b..710b32b 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -26,12 +26,12 @@
-+         pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-++        vmov.u8  q14, #128
-+ .endm
-+ 
-+ .macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-+-        vmov.u8  q14, #128
-+         vtbl.8   d16, {q0, q1}, d16
-+         vtbl.8   d17, {q0, q1}, d17
-+         vtbl.8   d18, {q0, q1}, d18
-+-- 
-+2.5.0
-+
-+
-+From 579f1584d688e1ac24fb7d22697e2a7b64f62e8e Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Fri, 9 Jan 2015 10:28:52 +0200
-+Subject: [PATCH 8/9] Added height check for SAO NEON optimizations. Faster SAO
-+ band NEON Some reordering to use NEON pipelines more efficiently
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  12 +++-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 142 ++++++++++++++++++++++---------------
-+ 2 files changed, 93 insertions(+), 61 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 385c35d..6d0689c 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -176,6 +176,7 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     int8_t offset_table[32] = { 0 };
-+     int k, y, x;
-+     int shift  = 3; // BIT_DEPTH - 5
-++    int cwidth = 0;
-+ 
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+@@ -183,7 +184,10 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     for (k = 0; k < 4; k++)
-+         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-+ 
-+-    switch(width){
-++    if (height % 8 == 0)
-++        cwidth = width;
-++
-++    switch(cwidth){
-+     case 8:
-+         ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+@@ -223,15 +227,19 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     pixel *src = (pixel *)_src;
-+     int a_stride, b_stride;
-+     int x, y;
-++    int cwidth = 0;
-+ 
-+     for (x = 0; x < 5; x++) {
-+         sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-+     }
-+ 
-++    if (height % 8 == 0)
-++        cwidth = width;
-++
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+ 
-+-    switch (width) {
-++    switch (cwidth) {
-+     case 32:
-+         switch(eo) {
-+         case 0:
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 710b32b..08f50b8 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -26,36 +26,59 @@
-+         pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-+-        vmov.u8  q14, #128
-++        vmov.u8  q3, #128
-+ .endm
-+ 
-+-.macro sao_band_32
-+-        vshr.u8  q8, q2, #3
-+-        vshr.u8  q9, q3, #3
-+-        vtbl.8   d16, {q0, q1}, d16
-+-        vtbl.8   d17, {q0, q1}, d17
-+-        vtbl.8   d18, {q0, q1}, d18
-+-        vtbl.8   d19, {q0, q1}, d19
-+-        vadd.s8  q2, q14
-+-        vadd.s8  q3, q14
-+-        vqadd.s8 q2, q8
-+-        vqadd.s8 q3, q9
-+-        vsub.s8  q2, q14
-+-        vsub.s8  q3, q14
-++// 128 in q3
-++// input q8 - q11
-++// 32 cycles
-++.macro sao_band_64
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-++        vtbl.8   d24, {d0, d1, d2, d3}, d24
-++        vadd.s8  q8, q3
-++        vtbl.8   d25, {d0, d1, d2, d3}, d25
-++        vadd.s8  q9, q3
-++        vtbl.8   d26, {d0, d1, d2, d3}, d26
-++        vadd.s8  q10, q3
-++        vtbl.8   d27, {d0, d1, d2, d3}, d27
-++        vadd.s8  q11, q3
-++        vtbl.8   d28, {d0, d1, d2, d3}, d28
-++        vqadd.s8 q8, q12
-++        vtbl.8   d29, {d0, d1, d2, d3}, d29
-++        vqadd.s8 q9, q13
-++        vtbl.8   d30, {d0, d1, d2, d3}, d30
-++        vqadd.s8 q10, q14
-++        vtbl.8   d31, {d0, d1, d2, d3}, d31
-++        vqadd.s8 q11, q15
-++        vsub.s8  q8, q3
-++        vsub.s8  q9, q3
-++        vsub.s8  q10, q3
-++        vsub.s8  q11, q3
-+ .endm
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #4
-+-        vld1.8   {d4}, [r1, :64], r3
-+-        vld1.8   {d5}, [r1, :64], r3
-+-        vld1.8   {d6}, [r1, :64], r3
-+-        vld1.8   {d7}, [r1, :64], r3
-+-        sao_band_32
-+-        vst1.8  {d4}, [r0, :64], r2
-+-        vst1.8  {d5}, [r0, :64], r2
-+-        vst1.8  {d6}, [r0, :64], r2
-+-        vst1.8  {d7}, [r0, :64], r2
-++1:      subs     r12, #8
-++        vld1.8   {d16}, [r1, :64], r3
-++        vld1.8   {d17}, [r1, :64], r3
-++        vld1.8   {d18}, [r1, :64], r3
-++        vld1.8   {d19}, [r1, :64], r3
-++        vld1.8   {d20}, [r1, :64], r3
-++        vld1.8   {d21}, [r1, :64], r3
-++        vld1.8   {d22}, [r1, :64], r3
-++        vld1.8   {d23}, [r1, :64], r3
-++        sao_band_64
-++        vst1.8  {d16}, [r0, :64], r2
-++        vst1.8  {d17}, [r0, :64], r2
-++        vst1.8  {d18}, [r0, :64], r2
-++        vst1.8  {d19}, [r0, :64], r2
-++        vst1.8  {d20}, [r0, :64], r2
-++        vst1.8  {d21}, [r0, :64], r2
-++        vst1.8  {d22}, [r0, :64], r2
-++        vst1.8  {d23}, [r0, :64], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -63,12 +86,16 @@ endfunc
-+ 
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #2
-+-        vld1.8  {q2}, [r1, :128], r3
-+-        vld1.8  {q3}, [r1, :128], r3
-+-        sao_band_32
-+-        vst1.8   {q2}, [r0, :128], r2
-+-        vst1.8   {q3}, [r0, :128], r2
-++1:      subs     r12, #4
-++        vld1.8  {q8}, [r1, :128], r3
-++        vld1.8  {q9}, [r1, :128], r3
-++        vld1.8  {q10}, [r1, :128], r3
-++        vld1.8  {q11}, [r1, :128], r3
-++        sao_band_64
-++        vst1.8   {q8}, [r0, :128], r2
-++        vst1.8   {q9}, [r0, :128], r2
-++        vst1.8   {q10}, [r0, :128], r2
-++        vst1.8   {q11}, [r0, :128], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -76,10 +103,12 @@ endfunc
-+ 
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1, :128], r3
-+-        sao_band_32
-+-        vst1.8   {q2-q3}, [r0, :128], r2
-++1:      subs     r12, #2
-++        vld1.8   {q8-q9}, [r1, :128], r3
-++        vld1.8   {q10-q11}, [r1, :128], r3
-++        sao_band_64
-++        vst1.8   {q8-q9}, [r0, :128], r2
-++        vst1.8   {q10-q11}, [r0, :128], r2
-+         bne      1b
-+ 
-+         bx       lr
-+@@ -89,13 +118,12 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         init_sao_band
-+ 1:      subs      r12, #1
-+         pld       [r1, r3]
-+-        vld1.8    {q2-q3}, [r1, :128]!
-+-        sao_band_32
-+-        vst1.8    {q2-q3}, [r0, :128]!
-+-        vld1.8    {q2-q3}, [r1, :128], r3
-++        vld1.8    {q8-q9}, [r1, :128]!
-++        vld1.8    {q10-q11}, [r1, :128], r3
-+         sub       r1, #32
-+-        sao_band_32
-+-        vst1.8    {q2-q3}, [r0, :128], r2
-++        sao_band_64
-++        vst1.8    {q8-q9}, [r0, :128]!
-++        vst1.8    {q10-q11}, [r0, :128], r2
-+         sub       r0, #32
-+         bne       1b
-+ 
-+@@ -121,7 +149,6 @@ endfunc
-+         vcgt.u8  q1,  q5, q9
-+         vcgt.u8 q15,  q9, q5
-+         vsub.s8  q0, q14, q0 // diff1
-+-
-+         vsub.s8  q1, q15, q1
-+ 
-+         vadd.s8  q0, q12 //diff0 + diff1
-+@@ -157,27 +184,25 @@ endfunc
-+ 
-+         vmov.u8  q15, #128 // s8 #-128
-+         vtbl.8   d0, {d24}, d0
-++        vadd.s8  q13,  q4, q15
-+         vtbl.8   d1, {d24}, d1
-++        vadd.s8  q14,  q5, q15
-+         vtbl.8   d2, {d24}, d2
-++        vqadd.s8 q0, q13
-+         vtbl.8   d3, {d24}, d3
-++        vqadd.s8 q1, q14
-+         vtbl.8   d4, {d24}, d4
-++        vadd.s8  q13,  q6, q15
-+         vtbl.8   d5, {d24}, d5
-++        vadd.s8  q14,  q7, q15
-+         vtbl.8   d6, {d24}, d6
-++        vqadd.s8 q2, q13
-+         vtbl.8   d7, {d24}, d7
-+-
-+-        vadd.s8  q12,  q4, q15
-+-        vadd.s8  q13,  q5, q15
-+-        vadd.s8  q14,  q6, q15
-+-        vadd.s8  q15,  q7, q15
-+-        vqadd.s8 q12,  q0
-+-        vqadd.s8 q15,  q3
-+-        vmov.u8   q3, #128 // s8 #-128
-+-        vqadd.s8 q13,  q1
-+-        vqadd.s8 q14,  q2
-+-        vsub.s8   q0, q12, q3
-+-        vsub.s8   q1, q13, q3
-+-        vsub.s8   q2, q14, q3
-+-        vsub.s8   q3, q15, q3
-++        vqadd.s8 q3, q14
-++        vsub.s8   q0, q15
-++        vsub.s8   q1, q15
-++        vsub.s8   q2, q15
-++        vsub.s8   q3, q15
-+         vst1.8  {q0-q1}, [r0, :128]!
-+         vst1.8  {q2-q3}, [r0, :128], r2
-+         sub     r0, #32
-+@@ -342,13 +367,12 @@ endfunc
-+ 
-+         vmov.u8  q10, #128
-+         vtbl.8   d0, {d31}, d0
-++        vadd.s8  q11, q2, q10
-+         vtbl.8   d1, {d31}, d1
-++        vadd.s8  q12, q3, q10
-+         vtbl.8   d2, {d31}, d2
-++        vqadd.s8 q11, q0
-+         vtbl.8   d3, {d31}, d3
-+-
-+-        vadd.s8    q11, q2, q10
-+-        vadd.s8    q12, q3, q10
-+-        vqadd.s8   q11, q0
-+         vqadd.s8   q12, q1
-+         vsub.s8    q0, q11, q10
-+         vsub.s8    q1, q12, q10
-+-- 
-+2.5.0
-+
-+
-+From 026bac1824e4936e948e6b1efec82868c520ea66 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 2 Feb 2015 16:08:27 +0200
-+Subject: [PATCH 9/9] Further SAO NEON optimisations
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  16 +--
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 224 +++++++++++++++++++------------------
-+ 2 files changed, 124 insertions(+), 116 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 6d0689c..e5da7e9 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-+-void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-+ 
-+ void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+@@ -189,16 +189,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+ 
-+     switch(cwidth){
-+     case 8:
-+-        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 16:
-+-        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 32:
-+-        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 64:
-+-        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     default:
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 08f50b8..9c7808d 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -22,21 +22,16 @@
-+ #include "neon.S"
-+ 
-+ .macro init_sao_band
-+-        ldr      r12, [sp, #0]    // offset_table address
-+         pld      [r1]
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #4]    // height
-++        vld1.8   {q0, q1}, [r2]  // offset table
-++        ldr       r2, [sp, #0]   // stride_dst
-++        ldr      r12, [sp, #4]   // height
-+         vmov.u8  q3, #128
-+ .endm
-+ 
-+ // 128 in q3
-+ // input q8 - q11
-+-// 32 cycles
-+ .macro sao_band_64
-+-        vshr.u8  q12, q8, #3
-+-        vshr.u8  q13, q9, #3
-+-        vshr.u8  q14, q10, #3
-+-        vshr.u8  q15, q11, #3
-+         vtbl.8   d24, {d0, d1, d2, d3}, d24
-+         vadd.s8  q8, q3
-+         vtbl.8   d25, {d0, d1, d2, d3}, d25
-+@@ -52,8 +47,8 @@
-+         vtbl.8   d30, {d0, d1, d2, d3}, d30
-+         vqadd.s8 q10, q14
-+         vtbl.8   d31, {d0, d1, d2, d3}, d31
-+-        vqadd.s8 q11, q15
-+         vsub.s8  q8, q3
-++        vqadd.s8 q11, q15
-+         vsub.s8  q9, q3
-+         vsub.s8  q10, q3
-+         vsub.s8  q11, q3
-+@@ -64,12 +59,16 @@ function ff_hevc_sao_band_w8_neon_8, export=1
-+ 1:      subs     r12, #8
-+         vld1.8   {d16}, [r1, :64], r3
-+         vld1.8   {d17}, [r1, :64], r3
-++        vshr.u8  q12, q8, #3
-+         vld1.8   {d18}, [r1, :64], r3
-+         vld1.8   {d19}, [r1, :64], r3
-++        vshr.u8  q13, q9, #3
-+         vld1.8   {d20}, [r1, :64], r3
-+         vld1.8   {d21}, [r1, :64], r3
-++        vshr.u8  q14, q10, #3
-+         vld1.8   {d22}, [r1, :64], r3
-+         vld1.8   {d23}, [r1, :64], r3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8  {d16}, [r0, :64], r2
-+         vst1.8  {d17}, [r0, :64], r2
-+@@ -88,9 +87,13 @@ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #4
-+         vld1.8  {q8}, [r1, :128], r3
-++        vshr.u8  q12, q8, #3
-+         vld1.8  {q9}, [r1, :128], r3
-++        vshr.u8  q13, q9, #3
-+         vld1.8  {q10}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-+         vld1.8  {q11}, [r1, :128], r3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8   {q8}, [r0, :128], r2
-+         vst1.8   {q9}, [r0, :128], r2
-+@@ -105,7 +108,11 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #2
-+         vld1.8   {q8-q9}, [r1, :128], r3
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-+         vld1.8   {q10-q11}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8   {q8-q9}, [r0, :128], r2
-+         vst1.8   {q10-q11}, [r0, :128], r2
-+@@ -119,7 +126,11 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+ 1:      subs      r12, #1
-+         pld       [r1, r3]
-+         vld1.8    {q8-q9}, [r1, :128]!
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-+         vld1.8    {q10-q11}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-+         sub       r1, #32
-+         sao_band_64
-+         vst1.8    {q8-q9}, [r0, :128]!
-+@@ -129,51 +140,18 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+ 
-+         bx lr
-+ endfunc
-+-// input
-+-// a in q0 - q3
-+-// c in q4 - q7
-+-// b in q8 - q11
-+-// offset table in r7 and r5
-+-// output in q0 - q3
-+-// clobbers q12 - q15
-+-.macro edge_w64_body
-+-        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13,  q5, q1
-+-        vcgt.u8  q1,  q1, q5
-+-        vsub.s8 q12,  q0, q12 // diff0
-+-        vcgt.u8  q0,  q4, q8 // c > b
-+-        vsub.s8 q13,  q1, q13
-+-
-+-        vcgt.u8 q14,  q8, q4 // b > c
-+-        vcgt.u8  q1,  q5, q9
-+-        vcgt.u8 q15,  q9, q5
-+-        vsub.s8  q0, q14, q0 // diff1
-+-        vsub.s8  q1, q15, q1
-+ 
-+-        vadd.s8  q0, q12 //diff0 + diff1
-+-        vadd.s8  q1, q13
-+-
-+-        vcgt.u8 q14,  q6, q2
-+-        vcgt.u8  q2,  q2, q6
-+-        vcgt.u8 q15,  q7, q3
-+-        vcgt.u8  q3,  q3, q7
-+-
-+-        vsub.s8 q14,  q2, q14
-+-        vcgt.u8  q2,  q6, q10
-+-        vsub.s8 q15,  q3, q15
-+-
-+-        vcgt.u8 q12, q10, q6
-+-        vcgt.u8  q3,  q7, q11
-+-        vcgt.u8 q13, q11, q7
-+-        vsub.s8  q2, q12, q2
-+-        vsub.s8  q3, q13, q3
-++.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
-++        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
-++        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
-++        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
-++        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
-++        vsub.s8 \out0, \tmp0, \out0 // diff0
-++        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
-++.endm
-+ 
-++.macro table64
-+         vmov.s8 q13, #2 // 2 to all elements
-+-
-+-        vadd.s8  q2, q14
-+-        vadd.s8  q3, q15
-+-
-+         vmov.32  d24[0], r4  // load offset table from general registers
-+         vmov.32  d24[1], r5  // load rest of offset table
-+ 
-+@@ -208,6 +186,28 @@ endfunc
-+         sub     r0, #32
-+ .endm
-+ 
-++// input
-++// a in q0 - q3
-++// c in q4 - q7
-++// b in q8 - q11
-++// offset table in r7 and r5
-++// output in q0 - q3
-++// clobbers q12 - q15
-++.macro edge_w64_body
-++        diff32 q12, q13, q0, q1, q0, q1, q4, q5
-++        diff32 q0, q1, q14, q15, q8, q9, q4, q5
-++
-++        vadd.s8  q0, q12 //diff0 + diff1
-++        vadd.s8  q1, q13
-++
-++        diff32  q14, q15, q2, q3, q2, q3, q6, q7
-++        diff32  q2, q3, q12, q13, q10, q11, q6, q7
-++
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q15
-++        table64
-++.endm
-++
-+ .macro init_edge_64
-+         push   {r4-r5}
-+         ldr    r12, [sp, #8] // height
-+@@ -334,38 +334,23 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-+-// inputs:
-+-// a in q0, q1
-+-// c in q2, q3
-+-// b in q8, q9
-+-// offset table in d31
-+-// clobbered registers q0, q1, q10, q11, q12, q13
-+-// output q0, q1
-+-.macro edge_w32_body
-+-        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q3, q1
-+-        vcgt.u8 q1,  q1, q3
-+-
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vcgt.u8  q0,  q2, q8 // c > b
-+-        vsub.s8 q13, q1, q13 // diff0 part 2
-+-
-+-        vcgt.u8  q10,  q8, q2 // b > c
-+-        vcgt.u8  q1,  q3, q9
-+-        vcgt.u8  q11,  q9, q3
-+-
-+-        vsub.s8 q0, q10, q0 // diff1
-+-
-+-        vmov.s8 q10, #2 // 2 to all elements
-+-        vsub.s8 q1, q11, q1 // diff1 part 2
-+-        vadd.s8 q0, q12 //diff0 + diff1
-+-        vadd.s8 q1, q13
-++.macro init_edge_32
-++        ldr     r12, [sp, #4] // sao_offset_val_table
-++        vld1.32 {d31}, [r12]
-++        ldr     r12, [sp] // height
-++.endm
-+ 
-+-        vadd.s8 q0, q10
-+-        vadd.s8 q1, q10
-++.macro diff out0, tmp0, in0, in1
-++        vcgt.u8 \out0, \in1, \in0  // c > a -> -1 , otherwise 0
-++        vcgt.u8 \tmp0,  \in0, \in1  // a > c -> -1 , otherwise 0
-++        vsub.s8 \out0, \tmp0, \out0 // diff0
-++.endm
-+ 
-+-        vmov.u8  q10, #128
-++.macro table32
-++        vmov.s8  q10, #2
-++        vadd.s8  q0, q10
-++        vadd.s8  q1, q10
-++        vmov.s8  q10, #128
-+         vtbl.8   d0, {d31}, d0
-+         vadd.s8  q11, q2, q10
-+         vtbl.8   d1, {d31}, d1
-+@@ -373,56 +358,68 @@ endfunc
-+         vtbl.8   d2, {d31}, d2
-+         vqadd.s8 q11, q0
-+         vtbl.8   d3, {d31}, d3
-+-        vqadd.s8   q12, q1
-+-        vsub.s8    q0, q11, q10
-+-        vsub.s8    q1, q12, q10
-++        vqadd.s8 q12, q1
-++        vsub.s8  q0, q11, q10
-++        vsub.s8  q1, q12, q10
-+         vst1.8   {q0-q1}, [r0, :128], r2
-+ .endm
-+ 
-+-.macro init_edge_32
-+-        ldr     r12, [sp, #4] // sao_offset_val_table
-+-        vld1.32 {d31}, [r12]
-+-        ldr     r12, [sp] // height
-+-.endm
-+-
-+ function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-+         init_edge_32
-+-        sub     r1, #4 // load 4 extra bytes
-++        vpush {q4-q7}
-++        sub     r1, #4
-+ 1:      subs    r12, #1
-+-        vld1.32 d3[1], [r1]!
-+-        vld1.8  {q2-q3}, [r1, :128]! // c
-+-        vld1.32 d20[0], [r1], r3
-+-        sub     r1, #36
-++        vld1.8  {q13-q14}, [r1]!
-++        vld1.32 d30, [r1], r3
-++        sub     r1, #32
-+         // a
-+-        vext.8  q0, q1, q2, #15
-+-        vext.8  q1, q2, q3, #15
-+-        // b
-+-        vext.8  q8, q2, q3, #1
-+-        vext.8  q9, q3, q10, #1
-+-        edge_w32_body
-++        vext.8   q0, q13, q14, #3
-++        vext.8   q1, q14, q15, #3
-++        vshr.u64 d24, d30, #24
-++        // c
-++        vext.8   q2, q13, q14, #4
-++        vext.8   q3, q14, q15, #4
-++        vshr.u64 d16, d30, #32
-++        // diff0
-++        diff32 q13, q14, q4, q5, q0, q1, q2, q3
-++        diff   d18, d25, d24, d16
-++        // -diff1
-++        vext.s8 q0, q13, q14, #1
-++        vext.s8 q1, q14, q9, #1
-++
-++        vsub.s8 q0, q13, q0 //diff0 + diff1
-++        vsub.s8 q1, q14, q1
-++        table32
-+         bne     1b
-++        vpop {q4-q7}
-++
-+         bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-+         init_edge_32
-++        vpush {q4-q7}
-+         // load a
-+         sub     r1, r3
-+         vld1.8  {q0-q1}, [r1, :128], r3
-+         // load c
-+         vld1.8  {q2-q3}, [r1, :128], r3
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
-+ 1:      subs    r12, #1
-+         // load b
-+         vld1.8  {q8-q9}, [r1, :128], r3
-+-        edge_w32_body
-+-        // inputs for next loop iteration
-+-        // a
-+-        vmov.64 q0, q2
-+-        vmov.64 q1, q3
-++        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
-++        vadd.s8 q0, q4, q12 //diff0 + diff1
-++        vadd.s8 q1, q5, q13
-++        table32
-++        // CMP ( c, a )
-++        vneg.s8 q12, q4
-++        vneg.s8 q13, q5
-+         // c
-+         vmov.64 q2, q8
-+         vmov.64 q3, q9
-+         bne     1b
-++        vpop {q4-q7}
-+         bx      lr
-+ endfunc
-+ 
-+@@ -452,7 +449,11 @@ function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-+         vext.8  q6, q10, q11, #8
-+         vext.8  q7, q11, q12, #8
-+         vext.8  q5, q10, q11, #7
-+-        edge_w32_body
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        table32
-+         // inputs for next loop iteration
-+         // a
-+         vmov.8  q0, q4
-+@@ -487,7 +488,14 @@ function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-+         vext.8  q8, q10, q11, #7
-+         vext.8  q9, q11, q12, #7
-+         vext.8  q14, q12, q10, #7
-+-        edge_w32_body
-++
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-++
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        table32
-++
-+         // inputs for next loop iteration
-+         // a
-+         vext.8  q0, q2, q3, #1
-+-- 
-+2.5.0
-+
-diff --git a/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch b/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-new file mode 100644
-index 0000000..5e8e07d
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-@@ -0,0 +1,409 @@
-+From 29c3327a0d72a7e872ff170363cfe5ed13bca5d0 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Tue, 22 Dec 2015 18:10:24 +0000
-+Subject: [PATCH] hevcdsp: ARM NEON optimized epel functions
-+
-+---
-+ libavcodec/arm/Makefile            |   1 +
-+ libavcodec/arm/hevcdsp_epel_neon.S | 334 +++++++++++++++++++++++++++++++++++++
-+ libavcodec/arm/hevcdsp_init_neon.c |  23 +++
-+ 3 files changed, 358 insertions(+)
-+ create mode 100644 libavcodec/arm/hevcdsp_epel_neon.S
-+
-+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-+index cdd35b0..6051ec8 100644
-+--- a/libavcodec/arm/Makefile
-++++ b/libavcodec/arm/Makefile
-+@@ -131,6 +131,7 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
-+                                           arm/synth_filter_neon.o
-+ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-+                                           arm/hevcdsp_deblock_neon.o    \
-++                                          arm/hevcdsp_epel_neon.o       \
-+                                           arm/hevcdsp_idct_neon.o       \
-+                                           arm/hevcdsp_qpel_neon.o
-+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
-+diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-+new file mode 100644
-+index 0000000..516ae5b
-+--- /dev/null
-++++ b/libavcodec/arm/hevcdsp_epel_neon.S
-+@@ -0,0 +1,334 @@
-++/*
-++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ *
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#include "libavutil/arm/asm.S"
-++#include "neon.S"
-++
-++#define MAX_PB_SIZE #64
-++
-++.macro vextin_d4
-++    vld1.8    {q10}, [r1], r2
-++    vmov      d16, d20
-++    vext.8    d17, d20, d21, #1
-++    vext.8    d18, d20, d21, #2
-++    vext.8    d19, d20, d21, #3
-++.endm
-++
-++.macro vextin_d4_8
-++    vld1.8    d16, [r1], r2
-++    vext.8    d17, d16, d16, #1
-++    vext.8    d18, d16, d16, #2
-++    vext.8    d19, d16, d16, #3
-++.endm
-++
-++.macro load_coeffs_16b coeffs
-++    ldr      \coeffs, [\coeffs]
-++    vdup.i8  d0, \coeffs
-++    lsr      \coeffs, #8
-++    vdup.i8  d1, \coeffs
-++    lsr      \coeffs, #8
-++    vdup.i8  d2, \coeffs
-++    lsr      \coeffs, #8
-++    vdup.i8  d3, \coeffs
-++.endm
-++
-++.macro epel_filter_16b out=q12
-++    vmull.u8 q3, d16, d0
-++    vmull.u8 q11, d19, d3
-++    vmull.u8 \out, d17, d1
-++    vmull.u8 q10, d18, d2
-++    vadd.s16 q3, q11
-++    vadd.s16 \out, q10
-++    vsub.s16 \out, q3
-++.endm
-++
-++.macro load_coeffs_32b coeffs
-++    ldr      \coeffs, [\coeffs]
-++    vmov.i64 d4, #0
-++    vmov.8   d4[0], \coeffs
-++    lsr      \coeffs, #8
-++    vmov.8   d4[2], \coeffs
-++    lsr      \coeffs, #8
-++    vmov.8   d4[4], \coeffs
-++    lsr      \coeffs, #8
-++    vmov.8   d4[6], \coeffs
-++.endm
-++
-++.macro epel_filter_32b
-++    vmull.s16 q3, d24, d4[0] //q12
-++    vmull.s16 q4, d25, d4[0]
-++    vmull.s16 q5, d30, d4[3] //q15
-++    vmull.s16 q6, d31, d4[3]
-++
-++    vmull.s16 q7, d26, d4[1] // q13
-++    vmull.s16 q8, d27, d4[1]
-++    vmull.s16 q9, d28, d4[2] // q14
-++    vmull.s16 q10, d29, d4[2]
-++    vadd.s32 q3, q5
-++    vadd.s32 q4, q6
-++    vadd.s32 q7, q9
-++    vadd.s32 q8, q10
-++    vsub.s32 q7, q3
-++    vsub.s32 q8, q4
-++    vqshrn.s32  d6, q7, #6
-++    vqshrn.s32  d7, q8, #6
-++.endm
-++
-++.macro epel_filter_32b_4
-++    vmull.s16 q3, d24, d4[0] //q12
-++    vmull.s16 q5, d30, d4[3] //q15
-++    vmull.s16 q7, d26, d4[1] // q13
-++    vmull.s16 q9, d28, d4[2] // q14
-++    vadd.s32 q3, q5
-++    vadd.s32 q7, q9
-++    vsub.s32 q7, q3
-++    vqshrn.s32  d6, q7, #6
-++.endm
-++
-++function ff_hevc_put_epel_h_neon_8, export=1
-++        push   {r4-r7}
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r7, [sp, #16] // mx
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2
-++        vpush {d8-d15}
-++        adrl   r12, epel_coeffs
-++        add    r7, r12
-++        sub       r1, #1
-++        lsl       r4, #1
-++        load_coeffs_16b r7
-++        mov   r12, r3
-++        mov   r6, r0
-++        mov   r7, r1
-++        cmp       r5, #6
-++        bgt       8f
-++        cmp       r5, #4
-++        blt       2f
-++        b         4f
-++8:      subs r3, #1
-++        pld [r1]
-++        vextin_d4
-++        epel_filter_16b
-++        vst1.16    {q12}, [r0], r4
-++        bne 8b
-++        subs    r5, #8
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16
-++        mov       r0, r6
-++        add       r7, #8
-++        mov       r1, r7
-++        cmp       r5, #4
-++        bgt       8b
-++4:      subs r3, #1
-++        pld [r1]
-++        vextin_d4_8
-++        epel_filter_16b
-++        vst1.16    d24, [r0], r4
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++2:      subs r3, #1
-++        pld [r1]
-++        vextin_d4_8
-++        epel_filter_16b
-++        vst1.32    d24[0], [r0], r4
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_put_epel_v_neon_8, export=1
-++        push   {r4-r7}
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r7, [sp, #20] // my
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2
-++        vpush {d8-d15}
-++        adrl   r12, epel_coeffs
-++        add    r7, r12
-++        load_coeffs_16b r7
-++        sub       r1, r2
-++        lsl       r4, #1
-++        mov   r12, r3
-++        mov   r6, r0
-++        mov   r7, r1
-++0:      pld [r1]
-++        vld1.8    {d16}, [r1], r2
-++        pld [r1]
-++        vld1.8    {d17}, [r1], r2
-++        pld [r1]
-++        vld1.8    {d18}, [r1], r2
-++        cmp       r5, #6
-++        bgt       8f
-++        cmp       r5, #4
-++        blt       2f
-++        b         4f
-++8:      pld [r1]
-++        vld1.8    {d19}, [r1], r2
-++        subs r3, #1
-++        epel_filter_16b
-++        vst1.16    {q12}, [r0], r4
-++        vmov d16, d17
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 8b
-++        subs    r5, #8
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16
-++        mov       r0, r6
-++        add       r7, #8
-++        mov       r1, r7
-++        b         0b
-++4:      pld       [r1]
-++        vld1.8    {d19}, [r1], r2
-++        subs r3, #1
-++        epel_filter_16b
-++        vst1.16    d24, [r0], r4
-++        vmov d16, d17
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++        b         0b
-++2:      pld [r1]
-++        vld1.8    {d19}, [r1], r2
-++        subs r3, #1
-++        epel_filter_16b
-++        vst1.32    d24[0], [r0], r4
-++        vmov d16, d17
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_put_epel_hv_neon_8, export=1
-++        push   {r4-r7}
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r6, [sp, #16] // mx
-++        ldr    r7, [sp, #20] // my
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2
-++        vpush {d8-d15}
-++        adrl   r12, epel_coeffs
-++        sub    r6, #1
-++        lsl    r6, #2
-++        add    r6, r12 // mx epel coeff offset
-++        add    r7, r12
-++        sub       r1, #1
-++        sub       r1, r2
-++        lsl       r4, #1
-++        load_coeffs_16b r6
-++        load_coeffs_32b r7
-++        mov   r12, r3
-++        mov   r6, r0
-++        mov   r7, r1
-++0:      pld   [r1]
-++        vextin_d4
-++        epel_filter_16b q12
-++        pld   [r1]
-++        vextin_d4
-++        epel_filter_16b q13
-++        pld   [r1]
-++        vextin_d4
-++        epel_filter_16b q14
-++        cmp       r5, #6
-++        bgt       8f
-++        cmp       r5, #4
-++        blt       2f
-++        b         4f
-++8:      pld     [r1]
-++        vextin_d4
-++        epel_filter_16b q15
-++        subs r3, #1
-++        epel_filter_32b
-++        vst1.16    {q3}, [r0], r4
-++        vmov q12, q13
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 8b
-++        subs    r5, #8
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16
-++        mov       r0, r6
-++        add       r7, #8
-++        mov       r1, r7
-++        b         0b
-++4:      pld      [r1]
-++        vextin_d4_8
-++        epel_filter_16b q15
-++        subs r3, #1
-++        epel_filter_32b_4
-++        vst1.16    d6, [r0], r4
-++        vmov q12, q13
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++        b         0b
-++2:      pld      [r1]
-++        vextin_d4_8
-++        epel_filter_16b q15
-++        subs r3, #1
-++        epel_filter_32b_4
-++        vst1.32    d6[0], [r0], r4
-++        vmov q12, q13
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++epel_coeffs:
-++       .byte 2, 58, 10, 2
-++       .byte 4, 54, 16, 2
-++       .byte 6, 46, 28, 4
-++       .byte 4, 36, 36, 4
-++       .byte 4, 28, 46, 6
-++       .byte 2, 16, 54, 4
-++       .byte 2, 10, 58, 2
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 5591807..733ff08 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -58,6 +58,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
-+ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
-+ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
-+ #undef PUT_PIXELS
-++void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-++void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-++void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-+ 
-+ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                    int height, int width);
-+@@ -201,7 +210,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
-+             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-+             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-++            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
-++            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
-++            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
-+         }
-++        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-++        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-++        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-++        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
-++        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
-++        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
-++        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
-++        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
-++        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
-++        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
-++
-+         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-+         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-+         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-+-- 
-+2.5.0
-+
-
-From 51c12471695d2d06c671707a7e2e6fec3b01f538 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 62/93] [ffmpeg] Add GPU acceleration to hevc
-
----
- tools/depends/target/ffmpeg/Makefile               |     4 +-
- .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 36672 +++++++++++++++++++
- 2 files changed, 36675 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index 58ec0eb..e4acfa9 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -3,7 +3,8 @@ include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-   0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
--  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch
-+  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch \
-+  pfcd_hevc_optimisations.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -82,6 +83,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- 	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
- 
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
-diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-new file mode 100644
-index 0000000..f2b8ffc
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-@@ -0,0 +1,36672 @@
-+From 5a8f38083c6d9afec5029408c8680b2676752035 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 28 Apr 2015 16:18:40 +0100
-+Subject: [PATCH 01/68] Added display output
-+
-+---
-+ ffmpeg.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 159 insertions(+)
-+
-+diff --git a/ffmpeg.c b/ffmpeg.c
-+index ce54374..026ffa9 100644
-+--- a/ffmpeg.c
-++++ b/ffmpeg.c
-+@@ -23,6 +23,11 @@
-+  * multimedia converter based on the FFmpeg libraries
-+  */
-+ 
-++#ifdef RPI
-++#define RPI_DISPLAY
-++//#define RPI_ZERO_COPY
-++#endif
-++
-+ #include "config.h"
-+ #include <ctype.h>
-+ #include <string.h>
-+@@ -69,6 +74,20 @@
-+ # include "libavfilter/buffersrc.h"
-+ # include "libavfilter/buffersink.h"
-+ 
-++#ifdef RPI_DISPLAY
-++#include <bcm_host.h>
-++#include <interface/mmal/mmal.h>
-++#include <interface/mmal/mmal_parameters_camera.h>
-++#include <interface/mmal/mmal_buffer.h>
-++#include <interface/mmal/util/mmal_util.h>
-++#include <interface/mmal/util/mmal_default_components.h>
-++#include <interface/mmal/util/mmal_connection.h>
-++#include <interface/mmal/util/mmal_util_params.h>
-++#ifdef RPI_ZERO_COPY
-++#include "libavcodec/rpi_qpu.h"
-++#endif
-++#endif
-++
-+ #if HAVE_SYS_RESOURCE_H
-+ #include <sys/time.h>
-+ #include <sys/types.h>
-+@@ -161,6 +180,134 @@ static int restore_tty;
-+ static void free_input_threads(void);
-+ #endif
-+ 
-++#ifdef RPI_DISPLAY
-++
-++#define NUM_BUFFERS 4
-++
-++static MMAL_COMPONENT_T* rpi_display = NULL;
-++static MMAL_POOL_T *rpi_pool = NULL;
-++
-++#ifdef RPI_ZERO_COPY
-++static uint8_t *get_vc_handle(AVBufferRef *bref) {
-++  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++  return (uint8_t *)p->vc_handle;
-++}
-++#endif
-++
-++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)
-++{
-++    MMAL_POOL_T* pool;
-++    size_t i;
-++    size_t size = (w*h*3)/2;
-++#ifdef RPI_ZERO_COPY
-++    mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
-++    pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
-++    assert(pool);
-++#else
-++    pool = mmal_port_pool_create(port, NUM_BUFFERS, size);
-++
-++    for (i = 0; i < NUM_BUFFERS; ++i)
-++    {
-++       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
-++       void* bufPtr = buffer->data;
-++       memset(bufPtr, i*30, w*h);
-++       memset(bufPtr+w*h, 128, (w*h)/2);
-++    }
-++#endif
-++
-++    return pool;
-++}
-++
-++static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-++  mmal_buffer_header_release(buffer);
-++}
-++
-++static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
-++{
-++    MMAL_COMPONENT_T* display;
-++    int w2 = (w+31)&~31;
-++    int h2 = (h+15)&~15;
-++    MMAL_DISPLAYREGION_T region =
-++    {
-++        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-++        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
-++        .layer = 2,
-++        .fullscreen = 0,
-++        .dest_rect = {x, y, w, h}
-++    };
-++    bcm_host_init();  // TODO is this needed?
-++    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
-++    assert(display);
-++
-++    mmal_port_parameter_set(display->input[0], &region.hdr);
-++
-++    MMAL_ES_FORMAT_T* format = display->input[0]->format;
-++    format->encoding = MMAL_ENCODING_I420;
-++    format->es->video.width = w2;
-++    format->es->video.height = h2;
-++    format->es->video.crop.x = 0;
-++    format->es->video.crop.y = 0;
-++    format->es->video.crop.width = w;
-++    format->es->video.crop.height = h;
-++    mmal_port_format_commit(display->input[0]);
-++
-++    mmal_component_enable(display);
-++
-++    rpi_pool = display_alloc_pool(display->input[0], w2, h2);
-++
-++    mmal_port_enable(display->input[0],display_cb_input);
-++    mmal_port_enable(display->control,display_cb_input);
-++
-++    printf("Allocated display %d %d\n",w,h);
-++
-++    return display;
-++}
-++
-++static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)
-++{
-++    int w = fr->width;
-++    int h = fr->height;
-++    int w2 = (w+31)&~31;
-++    int h2 = (h+15)&~15;
-++    if (!display || !rpi_pool)
-++        return;
-++    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
-++    if (!buf) {
-++      // Running too fast so drop the frame
-++      return;
-++    }
-++    assert(buf);
-++    buf->cmd = 0;
-++    buf->length = (w2 * h2 * 3)/2;
-++    buf->offset = 0; // Offset to valid data
-++    buf->flags = 0;
-++#ifdef RPI_ZERO_COPY
-++    buf->data = get_vc_handle(fr->buf[0]);
-++    buf->alloc_size = (w2*h2*3)/2;
-++#else
-++    //mmal_buffer_header_mem_lock(buf);
-++    memcpy(buf->data, fr->data[0], w2 * h);
-++    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);
-++    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);
-++    //mmal_buffer_header_mem_unlock(buf);
-++#endif
-++
-++    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
-++}
-++
-++static void display_exit(MMAL_COMPONENT_T* display)
-++{
-++    if (display) {
-++        mmal_component_destroy(display);
-++    }
-++    if (rpi_pool) {
-++        mmal_port_pool_destroy(display->input[0], rpi_pool);
-++    }
-++}
-++
-++#endif
-++
-++
-+ /* sub2video hack:
-+    Convert subtitles to video with alpha to insert them in filter graphs.
-+    This is a temporary solution until libavfilter gets real subtitles support.
-+@@ -582,6 +729,10 @@ static void ffmpeg_cleanup(int ret)
-+     }
-+     term_exit();
-+     ffmpeg_exited = 1;
-++
-++#ifdef RPI_DISPLAY
-++    display_exit(rpi_display);
-++#endif
-+ }
-+ 
-+ void remove_avoptions(AVDictionary **a, AVDictionary *b)
-+@@ -965,6 +1116,14 @@ static void do_video_out(AVFormatContext *s,
-+     int frame_size = 0;
-+     InputStream *ist = NULL;
-+     AVFilterContext *filter = ost->filter->filter;
-++#ifdef RPI_DISPLAY
-++    if (next_picture)
-++    {
-++	if (!rpi_display)
-++           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
-++        display_frame(rpi_display,next_picture);
-++    }
-++#endif
-+ 
-+     if (ost->source_index >= 0)
-+         ist = input_streams[ost->source_index];
-+-- 
-+2.5.0
-+
-+
-+From a72c0e18e722b541d4bb10f1f5c966f95eccbec1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 29 Apr 2015 16:49:43 +0100
-+Subject: [PATCH 02/68] Split transform and intra prediction into commands
-+
-+---
-+ libavcodec/hevc.c       | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/hevc.h       |  58 +++++++++++++++++++++++
-+ libavcodec/hevc_cabac.c |  15 ++++++
-+ 3 files changed, 191 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 5f77761..5566ace 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -918,6 +918,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
-+     return 0;
-+ }
-+ 
-++#ifdef RPI
-++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
-++{
-++    if (s->enable_rpi) {
-++        HEVCLocalContext *lc = s->HEVClc;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        cmd->type = RPI_PRED_INTRA;
-++        cmd->size = log2_trafo_size;
-++        cmd->c_idx = c_idx;
-++        cmd->x = x0;
-++        cmd->y = y0;
-++        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
-++        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
-++    } else {
-++        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
-++    }
-++}
-++#endif
-++
-+ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                               int xBase, int yBase, int cb_xBase, int cb_yBase,
-+                               int log2_cb_size, int log2_trafo_size,
-+@@ -930,8 +949,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+     if (lc->cu.pred_mode == MODE_INTRA) {
-+         int trafo_size = 1 << log2_trafo_size;
-+         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-+-
-++#ifdef RPI
-++        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
-++#else
-+         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-++#endif
-+     }
-+ 
-+     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
-+@@ -1017,7 +1039,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
-++#endif
-+                 }
-+                 if (cbf_cb[i])
-+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-+@@ -1046,7 +1072,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
-++#endif
-+                 }
-+                 if (cbf_cr[i])
-+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-+@@ -1075,7 +1105,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-+                                                     trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
-++#endif
-+                 }
-+                 if (cbf_cb[i])
-+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-+@@ -1085,7 +1119,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
-++#endif
-+                 }
-+                 if (cbf_cr[i])
-+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-+@@ -1097,26 +1135,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
-+             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
-+             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
-++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
-++#else
-+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
-+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
-++#endif
-+             if (s->ps.sps->chroma_format_idc == 2) {
-+                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
-++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
-++#else
-+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
-+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
-++#endif
-+             }
-+         } else if (blk_idx == 3) {
-+             int trafo_size_h = 1 << (log2_trafo_size + 1);
-+             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
-+             ff_hevc_set_neighbour_available(s, xBase, yBase,
-+                                             trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
-++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
-++#else
-+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-++#endif
-+             if (s->ps.sps->chroma_format_idc == 2) {
-+                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
-++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
-++#else
-+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
-+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
-++#endif
-+             }
-+         }
-+     }
-+@@ -2291,6 +2349,31 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
-+ }
-+ 
-++#ifdef RPI
-++static void rpi_execute_pred_cmds(HEVCContext *s)
-++{
-++  int i;
-++  HEVCPredCmd *cmd = s->univ_pred_cmds;
-++  HEVCLocalContext *lc = s->HEVClc;
-++
-++  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
-++      if (cmd->type == RPI_PRED_INTRA) {
-++          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-++          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-++          lc->na.cand_left         = (cmd->na >> 3) & 1;
-++          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
-++          lc->na.cand_up           = (cmd->na >> 1) & 1;
-++          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-++          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-++      } else {
-++          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-++      }
-++  }
-++  s->num_pred_cmds = 0;
-++  s->num_coeffs = 0;
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -2300,6 +2383,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int y_ctb       = 0;
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-++#ifdef RPI
-++    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-++#endif
-++
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+         return AVERROR_INVALIDDATA;
-+@@ -2329,6 +2416,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-++#ifdef RPI
-++        rpi_execute_pred_cmds(s);
-++#endif
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+             return more_data;
-+@@ -2374,6 +2464,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-+     s = s1->sList[self_id];
-+     lc = s->HEVClc;
-+ 
-++#ifdef RPI
-++    s->enable_rpi = 0;
-++#endif
-++
-+     if(ctb_row) {
-+         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
-+ 
-+@@ -2998,6 +3092,13 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ 
-+     av_freep(&s->cabac_state);
-+ 
-++#ifdef RPI
-++    av_freep(&s->unif_mv_cmds);
-++    av_freep(&s->unif_xfm_cmds);
-++    av_freep(&s->univ_pred_cmds);
-++    av_freep(&s->coeffs_buf);
-++#endif
-++
-+     for (i = 0; i < 3; i++) {
-+         av_freep(&s->sao_pixel_buffer_h[i]);
-+         av_freep(&s->sao_pixel_buffer_v[i]);
-+@@ -3057,6 +3158,22 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->HEVClcList[0] = s->HEVClc;
-+     s->sList[0] = s;
-+ 
-++#ifdef RPI
-++    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-++    if (!s->unif_mv_cmds)
-++        goto fail;
-++    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
-++    if (!s->unif_xfm_cmds)
-++        goto fail;
-++    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-++    if (!s->univ_pred_cmds)
-++        goto fail;
-++    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
-++    if (!s->coeffs_buf)
-++        goto fail;
-++    s->enable_rpi = 0;
-++#endif
-++
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+     if (!s->cabac_state)
-+         goto fail;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index d84e661..aa66b00 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -23,6 +23,9 @@
-+ #ifndef AVCODEC_HEVC_H
-+ #define AVCODEC_HEVC_H
-+ 
-++// define RPI to split the CABAC/prediction/transform into separate stages
-++#include "config.h"
-++
-+ #include "libavutil/buffer.h"
-+ #include "libavutil/md5.h"
-+ 
-+@@ -816,6 +819,49 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-++#ifdef RPI
-++
-++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-++#define RPI_MAX_WIDTH 2048
-++
-++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
-++#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-++#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-++// Each block can have an intra prediction and a transform_add command
-++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-++
-++// Command for inter prediction
-++typedef struct HEVCMvCmd {
-++} HEVCMvCmd;
-++
-++// Command for transform to process a block of coefficients
-++typedef struct HEVCXfmCmd {
-++} HEVCXfmCmd;
-++
-++// Command for intra prediction and transform_add of predictions to coefficients
-++#define RPI_PRED_TRANSFORM_ADD 0
-++#define RPI_PRED_INTRA 1
-++typedef struct HEVCPredCmd {
-++    uint8_t size;
-++    uint8_t type;
-++    uint8_t na;
-++    uint8_t c_idx;
-++    union {
-++        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
-++        uint32_t x;   // RPI_PRED_INTRA
-++    };
-++    union {
-++        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
-++        uint32_t y;   // RPI_PRED_INTRA
-++    };
-++    union {
-++        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
-++        uint32_t stride;         // RPI_PRED_INTRA
-++    };
-++} HEVCPredCmd;
-++
-++#endif
-++
-+ typedef struct HEVCContext {
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+@@ -831,6 +877,18 @@ typedef struct HEVCContext {
-+     int                 width;
-+     int                 height;
-+ 
-++#ifdef RPI
-++    int enable_rpi;
-++    HEVCMvCmd *unif_mv_cmds;
-++    HEVCXfmCmd *unif_xfm_cmds;
-++    HEVCPredCmd *univ_pred_cmds;
-++    int16_t *coeffs_buf;
-++    int num_mv_cmds;
-++    int num_xfm_cmds;
-++    int num_pred_cmds;
-++    int num_coeffs;
-++#endif
-++
-+     uint8_t *cabac_state;
-+ 
-+     /** 1 if the independent slice segment header was successfully parsed */
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index d1bef83..c0fdfad 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1510,6 +1510,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+         }
-+     }
-++#ifdef RPI
-++    if (s->enable_rpi) {
-++        int16_t *c = s->coeffs_buf + s->num_coeffs;
-++        int n = trafo_size * trafo_size;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
-++        s->num_coeffs += n;
-++        cmd->type = RPI_PRED_TRANSFORM_ADD;
-++        cmd->size = log2_trafo_size;
-++        cmd->buf = c;
-++        cmd->dst = dst;
-++        cmd->stride = stride;
-++        return;
-++    }
-++#endif
-+     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
-+ }
-+ 
-+-- 
-+2.5.0
-+
-+
-+From f4cf5194f103463ebd84eb36f571be06ca2aa49d Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 30 Apr 2015 15:23:22 +0100
-+Subject: [PATCH 03/68] Added simple VPU test code
-+
-+---
-+ libavcodec/Makefile             |    7 +
-+ libavcodec/hevc.c               |   33 +-
-+ libavcodec/rpi_hevc_transform.h |  212 ++++++
-+ libavcodec/rpi_hevc_transform.s |  147 ++++
-+ libavcodec/rpi_mailbox.c        |  293 ++++++++
-+ libavcodec/rpi_mailbox.h        |   20 +
-+ libavcodec/rpi_qpu.c            |  652 ++++++++++++++++++
-+ libavcodec/rpi_qpu.h            |   45 ++
-+ libavcodec/rpi_shader.c         |  818 ++++++++++++++++++++++
-+ libavcodec/rpi_shader.h         |   20 +
-+ libavcodec/rpi_shader.qasm      | 1413 +++++++++++++++++++++++++++++++++++++++
-+ libavcodec/rpi_user_vcsm.h      |  425 ++++++++++++
-+ 12 files changed, 4084 insertions(+), 1 deletion(-)
-+ create mode 100644 libavcodec/rpi_hevc_transform.h
-+ create mode 100644 libavcodec/rpi_hevc_transform.s
-+ create mode 100644 libavcodec/rpi_mailbox.c
-+ create mode 100644 libavcodec/rpi_mailbox.h
-+ create mode 100644 libavcodec/rpi_qpu.c
-+ create mode 100644 libavcodec/rpi_qpu.h
-+ create mode 100644 libavcodec/rpi_shader.c
-+ create mode 100644 libavcodec/rpi_shader.h
-+ create mode 100644 libavcodec/rpi_shader.qasm
-+ create mode 100644 libavcodec/rpi_user_vcsm.h
-+
-+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-+index 5088304..54e14b4 100644
-+--- a/libavcodec/Makefile
-++++ b/libavcodec/Makefile
-+@@ -4,6 +4,10 @@ NAME = avcodec
-+ 
-+ HEADERS = avcodec.h                                                     \
-+           avfft.h                                                       \
-++          rpi_qpu.h                                                     \
-++          rpi_shader.h                                                  \
-++          rpi_mailbox.h                                                 \
-++          rpi_hevc_transform.h                                          \
-+           dv_profile.h                                                  \
-+           d3d11va.h                                                     \
-+           dxva2.h                                                       \
-+@@ -35,6 +39,9 @@ OBJS = allcodecs.o                                                      \
-+        resample.o                                                       \
-+        resample2.o                                                      \
-+        utils.o                                                          \
-++       rpi_qpu.o                                                        \
-++       rpi_shader.o                                                     \
-++       rpi_mailbox.o                                                    \
-+        vorbis_parser.o                                                  \
-+        xiph.o                                                           \
-+ 
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 5566ace..e58a3d0 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -39,6 +39,10 @@
-+ #include "golomb.h"
-+ #include "hevc.h"
-+ 
-++#ifdef RPI
-++#include "rpi_qpu.h"
-++#endif
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ /**
-+@@ -2417,7 +2421,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        rpi_execute_pred_cmds(s);
-++        if (x_ctb + ctb_size >= s->ps.sps->width) {
-++            rpi_execute_pred_cmds(s);
-++        }
-+ #endif
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+@@ -3172,6 +3178,31 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->coeffs_buf)
-+         goto fail;
-+     s->enable_rpi = 0;
-++
-++    // A little test program
-++    {
-++      GPU_MEM_PTR_T p;
-++      int err = gpu_malloc_cached(16, &p);
-++      short *q = (short *)p.arm;
-++      int i;
-++      int r;
-++      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
-++      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
-++      printf("Preparing data %p\n",q);
-++      for(i=0;i<16;i++)
-++        q[i] = i;
-++      printf("Flush cache\n");
-++      gpu_cache_flush(&p);
-++      printf("Executing code\n");
-++      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
-++      printf("Return value %d (",r);
-++      for(i=0;i<16;i++)
-++        printf("%d ",q[i]);
-++      printf(")\n");
-++      gpu_free(&p);
-++      goto fail; // Early out
-++    }
-++
-+ #endif
-+ 
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+new file mode 100644
-+index 0000000..85a9102
-+--- /dev/null
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -0,0 +1,212 @@
-++unsigned char rpi_hevc_transform [] = {
-++169,
-++3,
-++3,
-++232,
-++128,
-++0,
-++0,
-++0,
-++20,
-++248,
-++0,
-++136,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++0,
-++96,
-++3,
-++232,
-++32,
-++0,
-++0,
-++0,
-++7,
-++232,
-++0,
-++2,
-++0,
-++0,
-++8,
-++232,
-++0,
-++4,
-++0,
-++0,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++5,
-++232,
-++0,
-++0,
-++8,
-++0,
-++128,
-++69,
-++113,
-++66,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++128,
-++69,
-++113,
-++70,
-++128,
-++144,
-++39,
-++0,
-++4,
-++255,
-++48,
-++192,
-++128,
-++3,
-++32,
-++8,
-++16,
-++0,
-++76,
-++254,
-++48,
-++192,
-++9,
-++4,
-++32,
-++8,
-++0,
-++0,
-++4,
-++254,
-++0,
-++144,
-++128,
-++2,
-++0,
-++248,
-++62,
-++0,
-++128,
-++144,
-++22,
-++0,
-++4,
-++255,
-++48,
-++192,
-++128,
-++3,
-++32,
-++8,
-++16,
-++0,
-++76,
-++254,
-++48,
-++192,
-++9,
-++4,
-++32,
-++8,
-++0,
-++0,
-++140,
-++248,
-++44,
-++0,
-++0,
-++0,
-++32,
-++48,
-++4,
-++0,
-++128,
-++69,
-++113,
-++66,
-++242,
-++140,
-++211,
-++192,
-++41,
-++3,
-++68,
-++192,
-++80,
-++7,
-++164,
-++255,
-++36,
-++220,
-++96,
-++2,
-++0,
-++248,
-++62,
-++0,
-++3,
-++255,
-++55,
-++208,
-++120,
-++3,
-++224,
-++3,
-++190,
-++11,
-++16,
-++139,
-++246,
-++83,
-++0,
-++103,
-++90,
-++0,
-++8,
-++240,
-++0,
-++128,
-++128,
-++3,
-++0,
-++247,
-++32,
-++128,
-++10,
-++4,
-++136,
-++240,
-++32,
-++0,
-++128,
-++3,
-++112,
-++96,
-++90,
-++0,
-++};
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+new file mode 100644
-+index 0000000..5e2728d
-+--- /dev/null
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -0,0 +1,147 @@
-++# ******************************************************************************
-++# Argon Design Ltd.
-++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-++#
-++# Module : HEVC
-++# Author : Peter de Rivaz
-++# ******************************************************************************
-++
-++# HEVC VPU Transform
-++#
-++# Transform matrix can be thought of as
-++#   output row vector = input row vector * transMatrix2
-++#
-++# The even rows of the matrix are symmetric
-++# The odd rows of the matrix are antisymmetric
-++#
-++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-++#
-++# EXAMPLE
-++#   (a b c d) (1 2  2  1)
-++#             (3 4 -4 -3)
-++#             (5 6  6  5)
-++#             (7 8 -8 -7)
-++#
-++#  x=(a c)(1 2) = 1a+5c 2a+6c
-++#         (5 6)
-++#
-++#  y=(b d)(3 4) = 3b+7d 4b+8d
-++#         (7 8)
-++#
-++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-++#
-++#  Final results are (u , v[::-1])
-++#
-++#
-++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-++#  Apply the even matrix first and stop before rounding
-++#  Then apply the odd matrix in a full manner:
-++#
-++#   First step is to compute partial products with the first input (16 cycles)
-++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
-++#   2a 4b 6c 8d
-++#   2a -4b 6c -8d
-++#   1a -3b 5c -7d
-++#
-++#   Second step is to sum partial products into final position (8 cycles)
-++#   1a+3b+5c+7d
-++#   2a+4b+6c+8d
-++#   2a-4b+6c-8d
-++#   1a-3b+5c-7d
-++#
-++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-++#
-++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-++#
-++#   For 8x8 we could compute two in parallel.
-++#
-++#
-++
-++test_add:
-++  vldh HX(0,0),(r0)
-++  vadd HX(0,0),HX(0,0),10
-++  vsth HX(0,0),(r0)
-++  mov r0,7 # return value
-++  b lr
-++
-++# Columns are transformed first
-++#
-++# Store top left half of transMatrix2 in
-++# Store bottom left half of transMatrix2 in HX(32,32)
-++#
-++# For 16x16
-++# HX(0:15,0) contains input data before transform
-++# HY(0:15,0) contains 32bit output data after transform
-++# HX(32,0) contains even rows of left half of transMatrix2
-++# HX(32,32) contains odd rows of left half of transMatrix2
-++# HY(48,0) contains partial products ready for summing
-++#
-++
-++
-++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-++# num: number of 16x16 transforms to be done
-++#
-++hevc_trans_16x16:
-++  push r6-r15, lr # TODO cut down number of used registers
-++
-++  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
-++  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++  # Now use r0 to describe which matrix we are working on.
-++  # Allows us to prefetch the next block of coefficients for efficiency.
-++  mov r0,0 # This describes the location where we read our coefficients from
-++  mov r3,16*2 # Stride of coefficients in bytes
-++  mov r7,16*16*2 # Total block size
-++  mov r8,64*16 # Value used to swap from current to next VRF location
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  mov r4,64 # Constant used for rounding first pass
-++  mov r5,1<<19 # Constant used for rounding second pass
-++
-++  # At start of block r0,r1 point to the current block (that has already been loaded)
-++block_loop:
-++  eor r0,r8
-++  add r1,r7
-++  # Prefetch the next block
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  eor r0,r8
-++  sub r1,r7
-++
-++  # Transform the current block
-++  bl col_trans_16
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-++
-++  bl col_trans_16
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++
-++  # Save results - note there has been a transposition during the processing so we save columns
-++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
-++
-++  # Move onto next block
-++  eor r0,r8
-++  add r1,r7
-++
-++  addcmpbgt r2,-1,0,block_loop
-++  pop r6-r15, pc
-++
-++# r1,r2,r3 r7,r8 should be preserved
-++# HX(0++,0)+r0 is the block to be transformed
-++# HX(32++,0) is the 16x16 matrix of transform coefficients
-++# Use HY(48,0) for intermediate results
-++# r0 can be used, but should be returned to its original value at the end
-++col_trans_16:
-++  add r4,r0,16 # Final value for this loop
-++col_trans_16_loop:
-++  # First compute partial products for a single column
-++  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-++  # Then sum up the results and place back
-++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-++  addcmpblt r0,1,r4,col_trans_16_loop
-++  sub r0,16  # but r0 back to its original value
-++  b lr
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+new file mode 100644
-+index 0000000..536896f
-+--- /dev/null
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -0,0 +1,293 @@
-++/*
-++Copyright (c) 2012, Broadcom Europe Ltd.
-++All rights reserved.
-++
-++Redistribution and use in source and binary forms, with or without
-++modification, are permitted provided that the following conditions are met:
-++    * Redistributions of source code must retain the above copyright
-++      notice, this list of conditions and the following disclaimer.
-++    * Redistributions in binary form must reproduce the above copyright
-++      notice, this list of conditions and the following disclaimer in the
-++      documentation and/or other materials provided with the distribution.
-++    * Neither the name of the copyright holder nor the
-++      names of its contributors may be used to endorse or promote products
-++      derived from this software without specific prior written permission.
-++
-++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-++*/
-++
-++#include <stdio.h>
-++#include <string.h>
-++#include <stdlib.h>
-++#include <fcntl.h>
-++#include <unistd.h>
-++#include <assert.h>
-++#include <stdint.h>
-++#include <sys/mman.h>
-++#include <sys/ioctl.h>
-++
-++#include <linux/ioctl.h>
-++
-++#define MAJOR_NUM 100
-++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-++#define DEVICE_FILE_NAME "/dev/char_dev"
-++
-++#include "rpi_mailbox.h"
-++
-++#define PAGE_SIZE (4*1024)
-++
-++// Shared memory will not be cached in ARM cache
-++void *mapmem_shared(unsigned base, unsigned size)
-++{
-++   int mem_fd;
-++   unsigned offset = base % PAGE_SIZE;
-++   base = base - offset;
-++   /* open /dev/mem */
-++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-++      return NULL;
-++   }
-++   void *mem = mmap(
-++      0,
-++      size,
-++      PROT_READ|PROT_WRITE,
-++      MAP_SHARED/*|MAP_FIXED*/,
-++      mem_fd,
-++      base);
-++#ifdef DEBUG
-++   printf("base=0x%x, mem=%p\n", base, mem);
-++#endif
-++   if (mem == MAP_FAILED) {
-++      printf("mmap error %d\n", (int)mem);
-++      return NULL;
-++   }
-++   close(mem_fd);
-++   return (char *)mem + offset;
-++}
-++
-++// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
-++void *mapmem_private(unsigned base, unsigned size)
-++{
-++   int mem_fd;
-++   unsigned offset = base % PAGE_SIZE;
-++   base = base - offset;
-++   /* open /dev/mem */
-++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-++      return NULL;
-++   }
-++   void *mem = mmap(
-++      0,
-++      size,
-++      PROT_READ|PROT_WRITE,
-++      MAP_PRIVATE/*|MAP_FIXED*/,
-++      mem_fd,
-++      base);
-++#ifdef DEBUG
-++   printf("base=0x%x, mem=%p\n", base, mem);
-++#endif
-++   if (mem == MAP_FAILED) {
-++      printf("mmap error %d\n", (int)mem);
-++      return NULL;
-++   }
-++   close(mem_fd);
-++   return (char *)mem + offset;
-++}
-++
-++void unmapmem(void *addr, unsigned size)
-++{
-++   int s = munmap(addr, size);
-++   if (s != 0) {
-++      printf("munmap error %d\n", s);
-++      exit (-1);
-++   }
-++}
-++
-++/*
-++ * use ioctl to send mbox property message
-++ */
-++
-++static int mbox_property(int file_desc, void *buf)
-++{
-++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-++
-++   if (ret_val < 0) {
-++      printf("ioctl_set_msg failed:%d\n", ret_val);
-++   }
-++
-++#ifdef DEBUG
-++   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-++   for (i=0; i<size/4; i++)
-++      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-++#endif
-++   return ret_val;
-++}
-++
-++unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x3000c; // (the tag id)
-++   p[i++] = 12; // (size of the buffer)
-++   p[i++] = 12; // (size of the data)
-++   p[i++] = size; // (num bytes? or pages?)
-++   p[i++] = align; // (alignment)
-++   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned mem_free(int file_desc, unsigned handle)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x3000f; // (the tag id)
-++   p[i++] = 4; // (size of the buffer)
-++   p[i++] = 4; // (size of the data)
-++   p[i++] = handle;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned mem_lock(int file_desc, unsigned handle)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x3000d; // (the tag id)
-++   p[i++] = 4; // (size of the buffer)
-++   p[i++] = 4; // (size of the data)
-++   p[i++] = handle;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned mem_unlock(int file_desc, unsigned handle)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x3000e; // (the tag id)
-++   p[i++] = 4; // (size of the buffer)
-++   p[i++] = 4; // (size of the data)
-++   p[i++] = handle;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x30010; // (the tag id)
-++   p[i++] = 28; // (size of the buffer)
-++   p[i++] = 28; // (size of the data)
-++   p[i++] = code;
-++   p[i++] = r0;
-++   p[i++] = r1;
-++   p[i++] = r2;
-++   p[i++] = r3;
-++   p[i++] = r4;
-++   p[i++] = r5;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned qpu_enable(int file_desc, unsigned enable)
-++{
-++   int i=0;
-++   unsigned p[32];
-++
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x30012; // (the tag id)
-++   p[i++] = 4; // (size of the buffer)
-++   p[i++] = 4; // (size of the data)
-++   p[i++] = enable;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
-++   int i=0;
-++   unsigned p[32];
-++
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++   p[i++] = 0x30011; // (the tag id)
-++   p[i++] = 16; // (size of the buffer)
-++   p[i++] = 16; // (size of the data)
-++   p[i++] = num_qpus;
-++   p[i++] = control;
-++   p[i++] = noflush;
-++   p[i++] = timeout; // ms
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return p[5];
-++}
-++
-++int mbox_open() {
-++   int file_desc;
-++
-++   // open a char device file used for communicating with kernel mbox driver
-++   file_desc = open(DEVICE_FILE_NAME, 0);
-++   if (file_desc < 0) {
-++      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
-++      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
-++   }
-++   return file_desc;
-++}
-++
-++void mbox_close(int file_desc) {
-++  close(file_desc);
-++}
-+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-+new file mode 100644
-+index 0000000..c264d2e
-+--- /dev/null
-++++ b/libavcodec/rpi_mailbox.h
-+@@ -0,0 +1,20 @@
-++#ifndef RPI_MAILBOX_H
-++#define RPI_MAILBOX_H
-++
-++extern int mbox_open(void);
-++extern void mbox_close(int file_desc);
-++
-++extern unsigned get_version(int file_desc);
-++extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
-++extern unsigned mem_free(int file_desc, unsigned handle);
-++extern unsigned mem_lock(int file_desc, unsigned handle);
-++extern unsigned mem_unlock(int file_desc, unsigned handle);
-++extern void *mapmem_shared(unsigned base, unsigned size);
-++extern void *mapmem_private(unsigned base, unsigned size);
-++extern void unmapmem(void *addr, unsigned size);
-++
-++extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-++extern unsigned qpu_enable(int file_desc, unsigned enable);
-++
-++#endif
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+new file mode 100644
-+index 0000000..b1f50ee
-+--- /dev/null
-++++ b/libavcodec/rpi_qpu.c
-+@@ -0,0 +1,652 @@
-++#ifdef RPI
-++// Use the vcsm device for shared memory
-++// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-++#define RPI_USE_VCSM
-++#define RPI_TIME_TOTAL_QPU
-++
-++#include <stdio.h>
-++#include <stdlib.h>
-++#include <string.h>
-++#include <stddef.h>
-++#include <assert.h>
-++
-++#include "config.h"
-++
-++#include <pthread.h>
-++#include <time.h>
-++
-++#include "rpi_mailbox.h"
-++#include "rpi_qpu.h"
-++#include "rpi_shader.h"
-++#include "rpi_hevc_transform.h"
-++
-++#ifdef RPI_USE_VCSM
-++#include "rpi_user_vcsm.h"
-++#endif
-++
-++// On Pi2 there is no way to access the VPU L2 cache
-++// GPU_MEM_FLG should be 4 for uncached memory.
-++// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-++// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-++#define GPU_MEM_FLG 0xC
-++#define GPU_MEM_MAP 0x0
-++
-++#define vcos_verify(x) ((x)>=0)
-++
-++typedef unsigned char uint8_t;
-++typedef signed char int8_t;
-++typedef unsigned short uint16_t;
-++typedef unsigned int uint32_t;
-++typedef int int32_t;
-++
-++/*static const unsigned code[] =
-++{
-++  #include "rpi_shader.hex"
-++};*/
-++
-++// Size in 32bit words
-++#define QPU_CODE_SIZE 2048
-++#define VPU_CODE_SIZE 2048
-++
-++struct GPU
-++{
-++  unsigned int qpu_code[QPU_CODE_SIZE];
-++  unsigned int vpu_code[VPU_CODE_SIZE];
-++  int open_count; // Number of allocated video buffers
-++  unsigned int vc_handle; // Handle of this memory
-++  int      mb; // Mailbox handle
-++  int      vc; // Address in GPU memory
-++  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-++};
-++
-++// Stop more than one thread trying to allocate memory or use the processing resources at once
-++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-++static volatile struct GPU* gpu = NULL;
-++
-++#ifdef RPI_TIME_TOTAL_QPU
-++static unsigned int Microseconds(void) {
-++    struct timespec ts;
-++    unsigned int x;
-++    static unsigned int base = 0;
-++    clock_gettime(CLOCK_REALTIME, &ts);
-++    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
-++    if (base==0) base=x;
-++    return x-base;
-++}
-++#endif
-++
-++// Connect to QPU, returns 0 on success.
-++static int gpu_init(volatile struct GPU **gpu) {
-++  int mb = mbox_open();
-++  int vc;
-++  int handle;
-++  volatile struct GPU* ptr;
-++	if (mb < 0)
-++		return -1;
-++
-++	if (qpu_enable(mb, 1)) return -2;
-++
-++#ifdef RPI_USE_VCSM
-++  vcsm_init();
-++#endif
-++
-++  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
-++  if (!handle)
-++  {
-++    qpu_enable(mb, 0);
-++    return -3;
-++  }
-++	vc = mem_lock(mb, handle);
-++	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
-++	if (ptr == NULL)
-++	{	mem_free(mb, handle);
-++		mem_unlock(mb, handle);
-++		qpu_enable(mb, 0);
-++		return -4;
-++	}
-++
-++	ptr->mb = mb;
-++	ptr->vc_handle = handle;
-++	ptr->vc = vc;
-++
-++  *gpu = ptr;
-++
-++  // Now copy over the QPU code into GPU memory
-++  {
-++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-++    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-++  }
-++  // And the VPU code
-++  {
-++    int num_bytes = sizeof(rpi_hevc_transform);
-++    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-++  }
-++
-++  return 0;
-++}
-++
-++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-++static void gpu_lock(void) {
-++  pthread_mutex_lock(&gpu_mutex);
-++  if (gpu==NULL) {
-++    gpu_init(&gpu);
-++  }
-++}
-++
-++static void gpu_unlock(void) {
-++  pthread_mutex_unlock(&gpu_mutex);
-++}
-++
-++// Allocate memory on GPU
-++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-++// Returns 0 on success.
-++// This allocates memory that will not be cached in ARM's data cache.
-++// Therefore safe to use without data cache flushing.
-++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-++  gpu_lock();
-++  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-++  p->vcsm_handle = 0;
-++  if (!p->vc_handle)
-++  {
-++    qpu_enable(gpu->mb, 0);
-++    return -3;
-++  }
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-++  p->numbytes = numbytes;
-++  if (p->arm == NULL)
-++  {
-++    mem_free(gpu->mb, p->vc_handle);
-++    mem_unlock(gpu->mb, p->vc_handle);
-++    gpu_unlock();
-++    qpu_enable(gpu->mb, 0);
-++    return -4;
-++  }
-++  gpu->open_count++;
-++  gpu_unlock();
-++  return 0;
-++}
-++
-++void gpu_cache_flush(GPU_MEM_PTR_T *p)
-++{
-++  // This only works when using RPI_USE_VCSM
-++  void *tmp = vcsm_lock(p->vcsm_handle);
-++  vcsm_unlock_ptr(tmp);
-++}
-++
-++// This allocates data that will be
-++//    Cached in ARM L2
-++//    Uncached in VPU L2
-++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-++  gpu_lock();
-++#ifdef RPI_USE_VCSM
-++  {
-++      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
-++      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++      p->arm = vcsm_lock(p->vcsm_handle);
-++      p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  }
-++#else
-++  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-++  p->vcsm_handle = 0;
-++  if (!p->handle)
-++  {
-++    qpu_enable(gpu->mb, 0);
-++    return -3;
-++  }
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  printf("This mapmem_private does not seem to work\n");
-++  exit(-1);
-++  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-++  p->numbytes = numbytes;
-++  if (p->arm == NULL)
-++  {
-++    mem_free(gpu->mb, p->handle);
-++    mem_unlock(gpu->mb, p->handle);
-++    gpu_unlock();
-++    qpu_enable(gpu->mb, 0);
-++    return -4;
-++  }
-++#endif
-++  gpu->open_count++;
-++  gpu_unlock();
-++  return 0;
-++}
-++
-++static void gpu_term(void)
-++{
-++	int mb = gpu->mb;
-++	unsigned handle = gpu->vc_handle;
-++  if (gpu==NULL)
-++    return;
-++	unmapmem((void*)gpu, sizeof(struct GPU));
-++	mem_unlock(mb, handle);
-++	mem_free(mb, handle);
-++	qpu_enable(mb, 0);
-++#ifdef RPI_USE_VCSM
-++  vcsm_exit();
-++#endif
-++	mbox_close(mb);
-++  gpu = NULL;
-++}
-++
-++void gpu_free(GPU_MEM_PTR_T *p) {
-++  int mb = gpu->mb;
-++	unsigned handle = p->vc_handle;
-++  gpu_lock();
-++#ifdef RPI_USE_VCSM
-++  if (p->vcsm_handle) {
-++      mem_unlock(mb,p->vc_handle);
-++      vcsm_unlock_ptr(p->arm);
-++      vcsm_free(p->vcsm_handle);
-++  } else {
-++	unmapmem((void*)p->arm, sizeof(struct GPU));
-++      mem_unlock(mb, handle);
-++      mem_free(mb, handle);
-++  }
-++#else
-++	unmapmem((void*)p->arm, sizeof(struct GPU));
-++	mem_unlock(mb, handle);
-++	mem_free(mb, handle);
-++#endif
-++
-++  gpu->open_count--;
-++  if (gpu->open_count==0) {
-++      printf("Closing GPU\n");
-++      gpu_term();
-++      gpu = NULL;
-++  }
-++  gpu_unlock();
-++}
-++
-++unsigned int vpu_get_fn(void) {
-++  // Make sure that the gpu is initialized
-++  if (gpu==NULL) {
-++    printf("Preparing gpu\n");
-++    gpu_lock();
-++    gpu_unlock();
-++  }
-++  return gpu->vc + offsetof(struct GPU,vpu_code);
-++}
-++
-++unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-++{
-++  unsigned r;
-++  gpu_lock();
-++  r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-++  gpu_unlock();
-++  return r;
-++}
-++
-++// Run a program on a QPU with the given code and uniform stream (given in GPU addresses)
-++// The first num QPUs will start at code, the next num2 QPUs will start at code2
-++void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12)
-++{
-++  int i;
-++#ifdef RPI_TIME_TOTAL_QPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++#endif
-++
-++  gpu_lock();
-++#ifdef RPI_TIME_TOTAL_QPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-++  for(i=0;i<num;i++) {
-++    gpu->mail[i*2 + 1] = code;
-++  }
-++  for(;i<num+num2;i++) {
-++    gpu->mail[i*2 + 1] = code2;
-++  }
-++  gpu->mail[0 ] = unifs1;
-++  gpu->mail[2 ] = unifs2;
-++  gpu->mail[4 ] = unifs3;
-++  gpu->mail[6 ] = unifs4;
-++  gpu->mail[8 ] = unifs5;
-++  gpu->mail[10] = unifs6;
-++	gpu->mail[12] = unifs7;
-++	gpu->mail[14] = unifs8;
-++	gpu->mail[16] = unifs9;
-++	gpu->mail[18] = unifs10;
-++	gpu->mail[20] = unifs11;
-++	gpu->mail[22] = unifs12;
-++	execute_qpu(
-++		gpu->mb,
-++		12 /* Number of QPUs */,
-++		gpu->vc + offsetof(struct GPU, mail),
-++		1 /* no flush */,  // Don't flush VPU L1 cache
-++		5000 /* timeout ms */);
-++#ifdef RPI_TIME_TOTAL_QPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  if ((count&0x7f)==0)
-++    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-++  gpu_unlock();
-++}
-++
-++unsigned int qpu_get_fn(int num) {
-++    // Make sure that the gpu is initialized
-++    unsigned int *fn;
-++    if (gpu==NULL) {
-++      printf("Preparing gpu\n");
-++      gpu_lock();
-++      gpu_unlock();
-++    }
-++    switch(num) {
-++    case QPU_MC_SETUP:
-++      fn = mc_setup;
-++      break;
-++    case QPU_MC_FILTER:
-++      fn = mc_filter;
-++      break;
-++    case QPU_MC_EXIT:
-++      fn = mc_exit;
-++      break;
-++    case QPU_MC_INTERRUPT_EXIT:
-++      fn = mc_interrupt_exit;
-++      break;
-++    case QPU_MC_FILTER_B:
-++      fn = mc_filter_b;
-++      break;
-++    case QPU_MC_FILTER_HONLY:
-++      fn = mc_filter_honly;
-++      break;
-++    case QPU_MC_SETUP_UV:
-++      fn = mc_setup_uv;
-++      break;
-++    case QPU_MC_FILTER_UV:
-++      fn = mc_filter_uv;
-++      break;
-++    case QPU_MC_FILTER_UV_B:
-++      fn = mc_filter_uv_b;
-++      break;
-++    case QPU_MC_END:
-++      fn = mc_end;
-++      break;
-++    default:
-++      printf("Unknown function\n");
-++      exit(-1);
-++    }
-++    return gpu->vc + 4*(int)(fn-rpi_shader);
-++    //return code[num] + gpu->vc;
-++}
-++
-++#if 0
-++
-++int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-++//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-++int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-++//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-++
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-++
-++static uint8_t av_clip_uint8(int32_t a)
-++{
-++    if (a&(~255)) return (-a)>>31;
-++    else          return a;
-++}
-++
-++static int32_t filter8(const uint8_t *data, int pitch)
-++{
-++   int32_t vsum = 0;
-++   int x, y;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += hcoeffs[x]*data[x + y * pitch];
-++
-++      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-++   }
-++
-++   return av_clip_uint8( (vsum + 64) >> 7);
-++}
-++
-++// Note regression changes coefficients so is not thread safe
-++//#define REGRESSION
-++#ifdef REGRESSION
-++#define CMAX 100
-++#else
-++#define CMAX 2
-++#endif
-++#define YMAX 16
-++
-++int rpi_test_shader(void)
-++{
-++   int i, c;
-++
-++   uint32_t *unifs;
-++
-++   uint8_t *in_buffer;
-++   uint8_t *out_buffer[2];
-++
-++   GPU_MEM_PTR_T unifs_ptr;
-++   GPU_MEM_PTR_T in_buffer_ptr;
-++   GPU_MEM_PTR_T out_buffer_ptr[2];
-++
-++   // Addresses in GPU memory of filter programs
-++   uint32_t mc_setup = 0;
-++   uint32_t mc_filter = 0;
-++   uint32_t mc_exit = 0;
-++
-++   int pitch = 0x500;
-++
-++   if (gpu==NULL) {
-++      gpu_lock();
-++      gpu_unlock();
-++   }
-++
-++   printf("This needs to change to reflect new assembler\n");
-++   // Use table to compute locations of program start points
-++   mc_setup = code[0] + gpu->vc;
-++   mc_filter = code[1] + gpu->vc;
-++   mc_exit = code[2] + gpu->vc;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-++      return -2;
-++   }
-++   unifs = (uint32_t*)unifs_ptr.arm;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-++      return -3;
-++   }
-++   in_buffer = (uint8_t*)in_buffer_ptr.arm;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-++      return -4;
-++   }
-++   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-++   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-++
-++   for (c = 0; c < CMAX; c++) {
-++      int xo[] = {rand()&31, rand()&31};
-++
-++#ifdef REGRESSION
-++      for (i = 0; i < 8; i++) {
-++         hcoeffs[i] = (int8_t)rand();
-++         vcoeffs[i] = (int8_t)rand();
-++         if (hcoeffs[i]==-128)
-++           hcoeffs[i]++;
-++         if (vcoeffs[i]==-128)
-++           vcoeffs[i]++;
-++      }
-++#endif
-++
-++      for (i = 0; i < 64*23; i++) {
-++         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-++         in_buffer[i] = rand();
-++      }
-++
-++      // Clear output array
-++      {
-++        int b;
-++        for(b=0;b<2;b++) {
-++          for(i=0;i<16*16;i++) {
-++            out_buffer[b][i] = 3;
-++          }
-++        }
-++      }
-++
-++      unifs[0] = mc_filter;
-++      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-++      unifs[2] = 64; // src pitch
-++      unifs[3] = pitch; // dst pitch
-++      unifs[4] = 0; // Padding
-++      unifs[5] = 0;
-++      unifs[6] = 0;
-++      unifs[7 ] = mc_filter;
-++      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-++      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++      unifs[13] = out_buffer_ptr[0].vc;
-++      unifs[14] = mc_exit;
-++      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
-++      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++      unifs[20] = out_buffer_ptr[1].vc;
-++
-++      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-++
-++      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-++
-++      //qpu_run_shader(mc_setup, unifs_ptr.vc);
-++      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-++      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-++      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-++
-++      if (1)
-++      {
-++         int x, y, b;
-++         int bad = 0;
-++
-++         for (b=0; b<2; ++b)
-++            for (y=0; y<YMAX; ++y)
-++               for (x=0; x<16; ++x) {
-++                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
-++
-++                  if (out_buffer[b][x+y*pitch] != ref) {
-++                      bad = 1;
-++//                     printf("%d, %d, %d, %d\n", c, b, x, y);
-++                  }
-++#ifndef REGRESSION
-++                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
-++#endif
-++               }
-++          if (bad)
-++            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-++          else
-++            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-++      }
-++      //printf("%d\n", simpenrose_get_qpu_tick_count());
-++   }
-++
-++   gpu_free(&out_buffer_ptr[0]);
-++   gpu_free(&out_buffer_ptr[1]);
-++   gpu_free(&in_buffer_ptr);
-++   gpu_free(&unifs_ptr);
-++
-++   return 0;
-++}
-++
-++void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
-++{
-++  int x,y;
-++  for (y=0; y<16; ++y) {
-++    for (x=0; x<16; ++x) {
-++       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
-++    }
-++  }
-++}
-++
-++void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
-++{
-++   uint32_t *unifs;
-++
-++   GPU_MEM_PTR_T unifs_ptr;
-++   //uint8_t *out_buffer;
-++   //GPU_MEM_PTR_T out_buffer_ptr;
-++
-++   // Addresses in GPU memory of filter programs
-++   uint32_t mc_setup = 0;
-++   uint32_t mc_filter = 0;
-++   uint32_t mc_exit = 0;
-++   //int x,y;
-++
-++   if (gpu==NULL) {
-++      gpu_lock();
-++      gpu_unlock();
-++   }
-++
-++   // Use table to compute locations of program start points
-++   mc_setup = code[0] + gpu->vc;
-++   mc_filter = code[1] + gpu->vc;
-++   mc_exit = code[2] + gpu->vc;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-++      return;
-++   }
-++   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-++   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
-++
-++   /*for (y=0; y<16; ++y) {
-++      for (x=0; x<16; ++x) {
-++         out_buffer[x+y*dst_pitch] = 7;
-++      }
-++    }*/
-++
-++   unifs = (uint32_t*)unifs_ptr.arm;
-++
-++    unifs[0] = mc_filter;
-++    unifs[1] = (int)in_buffer_vc;
-++    unifs[2] = src_pitch; // src pitch
-++    unifs[3] = dst_pitch; // dst pitch
-++    unifs[4] = 0; // Padding
-++    unifs[5] = 0;
-++    unifs[6] = 0;
-++    unifs[7 ] = mc_exit;
-++    unifs[8 ] = (int)in_buffer_vc;
-++    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++    unifs[13] = (int)dst_vc;
-++    //unifs[13] = (int)out_buffer_ptr.vc;
-++
-++    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-++
-++    qpu_run_shader(mc_setup, unifs_ptr.vc);
-++
-++    /*for (y=0; y<16; ++y) {
-++      for (x=0; x<16; ++x) {
-++         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-++      }
-++    }*/
-++
-++    gpu_free(&unifs_ptr);
-++    //gpu_free(&out_buffer_ptr);
-++}
-++
-++
-++#endif
-++
-++#endif // RPI
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+new file mode 100644
-+index 0000000..4e3c35c
-+--- /dev/null
-++++ b/libavcodec/rpi_qpu.h
-+@@ -0,0 +1,45 @@
-++#ifndef RPI_QPU_H
-++#define RPI_QPU_H
-++
-++typedef struct gpu_mem_ptr_s {
-++  unsigned char *arm; // Pointer to memory mapped on ARM side
-++  int vc_handle;   // Videocore handle of relocatable memory
-++  int vcsm_handle; // Handle for use by VCSM
-++  int vc;       // Address for use in GPU code
-++  int numbytes; // Size of memory block
-++} GPU_MEM_PTR_T;
-++
-++// General GPU functions
-++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-++extern void gpu_free(GPU_MEM_PTR_T *p);
-++extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-++
-++// QPU specific functions
-++extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-++
-++enum {
-++  QPU_MC_SETUP,
-++  QPU_MC_FILTER,
-++  QPU_MC_EXIT,
-++  QPU_MC_INTERRUPT_EXIT,
-++  QPU_MC_FILTER_B,
-++  QPU_MC_FILTER_HONLY,
-++  QPU_MC_SETUP_UV,
-++  QPU_MC_FILTER_UV,
-++  QPU_MC_FILTER_UV_B,
-++  QPU_MC_END
-++  };
-++extern unsigned int qpu_get_fn(int num);
-++
-++// VPU specific functions
-++extern unsigned int vpu_get_fn(void);
-++extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++
-++// Simple test of shader code
-++extern int rpi_test_shader(void);
-++
-++extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
-++extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
-++
-++#endif
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+new file mode 100644
-+index 0000000..41cc2e1
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.c
-+@@ -0,0 +1,818 @@
-++#include "rpi_shader.h"
-++
-++#ifdef _MSC_VER
-++   #include <stdint.h>
-++   /* cast through uintptr_t to avoid warnings */
-++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-++#else
-++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
-++#endif
-++
-++#ifdef __cplusplus
-++extern "C" { /* the types are probably wrong... */
-++#endif
-++#ifdef __cplusplus
-++}
-++#endif
-++
-++#ifdef _MSC_VER
-++__declspec(align(8))
-++#elif defined(__GNUC__)
-++__attribute__((aligned(8)))
-++#endif
-++unsigned int rpi_shader[] = {
-++// ::mc_setup
-++/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
-++/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
-++/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++// ::mc_filter_uv
-++/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop
-++/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter
-++/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-++/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :loop
-++/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// :fast_path
-++/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :fast_loop
-++/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-++/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-++/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_b
-++/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :bloop
-++/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-++/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_honly
-++/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-++/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-++/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :loop_honly
-++/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-++/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-++/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-++/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-++/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-++/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-++/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_exit
-++/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_exit1
-++/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit
-++/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit4
-++/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_setup_uv
-++/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++// ::mc_filter_uv_b
-++/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop_b
-++/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_end
-++};
-++#ifdef __HIGHC__
-++#pragma Align_to(8, rpi_shader)
-++#endif
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+new file mode 100644
-+index 0000000..db971f4
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.h
-+@@ -0,0 +1,20 @@
-++#ifndef rpi_shader_H
-++#define rpi_shader_H
-++
-++extern unsigned int rpi_shader[];
-++
-++#define mc_setup (rpi_shader + 0)
-++#define mc_filter_uv (rpi_shader + 146)
-++#define mc_filter (rpi_shader + 360)
-++#define mc_filter_b (rpi_shader + 670)
-++#define mc_filter_honly (rpi_shader + 894)
-++#define mc_exit (rpi_shader + 1048)
-++#define mc_exit1 (rpi_shader + 1066)
-++#define mc_interrupt_exit (rpi_shader + 1082)
-++#define mc_interrupt_exit4 (rpi_shader + 1120)
-++#define mc_interrupt_exit8 (rpi_shader + 1142)
-++#define mc_setup_uv (rpi_shader + 1172)
-++#define mc_filter_uv_b (rpi_shader + 1314)
-++#define mc_end (rpi_shader + 1542)
-++
-++#endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+new file mode 100644
-+index 0000000..6851e83
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -0,0 +1,1413 @@
-++# register allocation
-++#
-++# ra0...ra7                                     eight horizontal filter coefficients
-++#
-++# rb1...rb7                                     seven shifted copies of the current unfiltered row
-++#
-++# ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
-++#
-++#                                               (ra15 isn't clamped to zero - this happens during the
-++#                                                copy to ra14, and during its use in the vertical filter)
-++#
-++# rb8...rb15                                    eight vertical filter coefficients
-++#
-++# ra16                                          clipped(row start address+elem_num)&~3
-++# ra17                                          per-channel shifts
-++# ra19                                          next ra17
-++#
-++# rb16                                          pitch
-++# rb17                                          height + 5
-++# rb18                                          height + 7
-++# rb19                                          next ra16
-++#
-++# ra20                                          1
-++# ra21                                          64
-++# ra22                                          256
-++# ra23                                          8
-++#
-++# rb20                                          0xffffff00
-++# rb21                                          64
-++# rb22                                          255
-++# rb23                                          24
-++#
-++# rb24                                          vdw_setup_1(dst_pitch)
-++# rb25                                          frame width-1
-++# rb26                                          height<<23 + width<<16 + vdw_setup_0
-++# rb27                                          vdw_setup_0 (depends on QPU number)
-++# rb28                                          vpm_setup (depends on QPU number)
-++# rb29                                          vdw_setup_1(dst_pitch-width)
-++# rb30                                          frame height-1
-++# rb31                                          used as temp to count loop iterations
-++#
-++# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
-++# ra24                                          clipped(row start address+8+elem_num)&~3
-++# ra25                                          per-channel shifts 2
-++# ra26                                          next ra24
-++# ra27                                          next ra25
-++# ra28                                          next y
-++# ra29                                          y for next texture access
-++#
-++# ra31                                          next kernel address
-++
-++.set rb_frame_width_minus_1,       rb25
-++.set rb_frame_height_minus_1,      rb30
-++.set rb_pitch,                     rb16
-++.set ra_x_base,                    ra16
-++.set rb_x_base_next,               rb19
-++.set ra_x2_base,                   ra24
-++.set ra_x2_base_next,              ra26
-++.set ra_xshift,                    ra17
-++
-++.set ra_x2shift,                   ra25
-++.set ra_u2v_ref_offset,            ra25
-++
-++.set ra_xshift_next,               ra19
-++
-++.set ra_x2shift_next,              ra27
-++.set ra_u2v_dst_offset,            ra27
-++
-++.set ra_y_next,                    ra28
-++.set ra_y,                         ra29
-++
-++.set rb_const_64,                  rb21
-++
-++# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
-++::mc_setup
-++
-++# Read starting kernel
-++mov ra31, unif
-++
-++# Load first request location
-++add ra_x_base, unif, elem_num # Store x
-++mov ra_y, unif # Store y
-++mov ra_x2_base, unif # Store frame base
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra21, 64
-++mov ra22, 256
-++mov ra23, 8
-++
-++mov rb20, 0xffffff00
-++mov rb21, 64
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))
-++add rb28, r0, r1
-++
-++# Compute base address for first and second access
-++#add r0, unif, elem_num     # x
-++mov r0, ra_x_base           # Load x
-++add r2, r0, 8               # x+8
-++max r0, r0, 0; mov r1, ra_y # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++add ra_y, r1, 1
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++max r1, r1, 0  # y
-++min r1, r1, rb_frame_height_minus_1
-++add r0, r0, r3; mul24 r1, r1, rb_pitch
-++add r2, r2, r3
-++and r0, r0, ~3
-++and r2, r2, ~3; mov ra_x_base, r0
-++# submit texture requests for first line
-++add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r2, r1
-++
-++# Dump padding words
-++mov r0, unif
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++bra -, ra31
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_x_base
-++add t0s, r1, ra_x2_base
-++
-++################################################################################
-++
-++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_uv
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:uvloop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++
-++# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++brr.anynn -, r:fast_path
-++asr rb12, r0, rb23  # delay slot 1
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8 # delay slot 2
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r2, r2, r3                                                    ; ldtmu0
-++##
-++## mov r0, ra22
-++## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # apply horizontal filter
-++##
-++## asr r2, r2, 15    ; mul24 r3, r0, ra0
-++## min r2, r2, rb22
-++## max ra13, r2, 0
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21
-++## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
-++## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra14, r0, 0
-++##
-++##
-++##
-++##
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra15, r0, 0
-++
-++
-++
-++
-++mov r3, 0
-++
-++:loop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:loop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:loop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++####################################################
-++
-++:fast_path
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++## sub r2, r2, r3                                                    ; ldtmu0
-++##
-++## mov r0, ra22
-++## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # apply horizontal filter
-++##
-++## asr r2, r2, 15    ; mul24 r3, r0, ra0
-++## min r2, r2, rb22
-++## max ra13, r2, 0
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21
-++## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra14, r0, 0
-++##
-++##
-++##
-++##
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21   ; mul24    r3, r0, ra0
-++## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra15, r0, 0
-++
-++
-++mov r3, 0  # This signifies the amount of unrolling
-++
-++:fast_loop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++
-++max r2, ra_y, 0
-++min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++sub r0, r2, r3       ; mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8       ; mov r1, ra22
-++
-++# apply horizontal filter
-++
-++brr.anyn -, r:fast_loop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:fast_loop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_b
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++# r0 is currently height<<7
-++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-++shl r3, r0, 13
-++shl r3, r3, 8 # Mask off top 8 bits
-++shr r3, r3, 8
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++# In a B frame, so also set up VPM read
-++add vr_setup, r3, rb28
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov r3, 0
-++
-++:bloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:bloop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 15          ; mov -, vr_wait
-++min r1, r1, rb22
-++add r0, vpm, 1          # Blend in previous VPM contents at this location
-++brr.anyn -, r:bloop
-++max r1, r1, 0
-++add r1, r1, r0
-++shr vpm, r1, 1
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++# This filter only does horizontal filtering.
-++# It is assumed that the region to fetch does not include extra rows above.
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_honly
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
-++shl r0, r0, 7 ; mov rb18,r0
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++mov r0, unif
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-++mov r3, 0
-++
-++:loop_honly
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3       ; mov r3, rb31
-++
-++sub.setf -, r3, rb18 ; mov r1, ra22
-++
-++mov -, vw_wait   ; mul24 r0, r0, r1
-++brr.anyn -, r:loop_honly
-++asr r0, r0, 15          # delay 1
-++min r0, r0, rb22        # delay 2
-++max vpm, r0, 0          # delay 3
-++
-++# DMA out
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++
-++################################################################################
-++
-++# mc_exit()
-++
-++::mc_exit
-++mov  -, vw_wait # wait on the VDW
-++
-++mov -,srel(0)
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++nop        ; nop ; thrend
-++nop        ; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++::mc_exit1
-++mov  -, vw_wait # wait on the VDW
-++
-++#mov -,srel(1)
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit()
-++::mc_interrupt_exit
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++mov -,sacq(0) # 8
-++mov -,sacq(0) # 9
-++mov -,sacq(0) # 10
-++mov -,sacq(0) # 11
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit4()
-++::mc_interrupt_exit4
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit8()
-++::mc_interrupt_exit8
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++################################################################################
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++::mc_setup_uv
-++
-++# Read starting kernel
-++mov ra31, unif
-++
-++# Load first request location
-++add ra_x_base, unif, elem_num # Store x
-++mov ra_y, unif # Store y
-++mov ra_x2_base, unif # Store frame u base
-++nop
-++sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra21, 64
-++mov ra22, 256
-++mov ra23, 8
-++
-++mov rb20, 0xffffff00
-++mov rb21, 64
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))
-++add rb28, r0, r1
-++
-++# Compute base address for first and second access
-++mov r0, ra_x_base           # Load x
-++max r0, r0, 0; mov r1, ra_y # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++add ra_y, r1, 1
-++add r0, r0, r3
-++and r0, r0, ~3
-++max r1, r1, 0 ; mov ra_x_base, r0 # y
-++min r1, r1, rb_frame_height_minus_1
-++# submit texture requests for first line
-++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r2, r1
-++
-++# Dump padding words
-++mov r0, unif
-++mov r0, unif
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++bra -, ra31
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_x_base
-++add t0s, r1, ra_x2_base
-++
-++
-++
-++################################################################################
-++
-++::mc_filter_uv_b
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++
-++# r0 is currently height<<7
-++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-++shl r3, r0, 13
-++shl r3, r3, 8 # Mask off top 8 bits
-++shr r3, r3, 8
-++
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# In a B frame, so also set up VPM read
-++add vr_setup, r3, rb28
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop_b
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop_b
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 15
-++min r1, r1, rb22
-++add r0, vpm, 1          # Blend in previous VPM contents at this location
-++brr.anyn -, r:uvloop_b
-++max r1, r1, 0
-++add r1, r1, r0
-++shr vpm, r1, 1
-++
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++::mc_end
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+new file mode 100644
-+index 0000000..fbebbbe
-+--- /dev/null
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -0,0 +1,425 @@
-++/*
-++Copyright (c) 2012, Broadcom Europe Ltd
-++All rights reserved.
-++
-++Redistribution and use in source and binary forms, with or without
-++modification, are permitted provided that the following conditions are met:
-++    * Redistributions of source code must retain the above copyright
-++      notice, this list of conditions and the following disclaimer.
-++    * Redistributions in binary form must reproduce the above copyright
-++      notice, this list of conditions and the following disclaimer in the
-++      documentation and/or other materials provided with the distribution.
-++    * Neither the name of the copyright holder nor the
-++      names of its contributors may be used to endorse or promote products
-++      derived from this software without specific prior written permission.
-++
-++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-++*/
-++
-++#ifndef __USER_VCSM__H__INCLUDED__
-++#define __USER_VCSM__H__INCLUDED__
-++
-++/* VideoCore Shared Memory - user interface library.
-++**
-++** This library provides all the necessary abstraction for any application to
-++** make use of the shared memory service which is distributed accross a kernel
-++** driver and a videocore service.
-++**
-++** It is an application design decision to choose or not to use this service.
-++**
-++** The logical flow of operations that a user application needs to follow when
-++** using this service is:
-++**
-++**       1) Initialize the service.
-++**       2) Allocate shared memory blocks.
-++**       3) Start using the allocated blocks.
-++**          - In order to gain ownership on a block, lock the allocated block,
-++**            locking a block returns a valid address that the user application
-++**            can access.
-++**          - When finished with using the block for the current execution cycle
-++**            or function, and so when giving up the ownership, unlock the block.
-++**       4) A block can be locked/unlocked as many times required - within or outside
-++**          of - a specific execution context.
-++**       5) To completely release an allocated block, free it.
-++**       6) If the service is no longer required, terminate it.
-++**
-++**
-++** Some generic considerations:
-++
-++** Allocating memory blocks.
-++**
-++**   Memory blocks can be allocated in different manners depending on the cache
-++**   behavior desired.  A given block can either be:
-++
-++**       - Allocated in a non cached fashion all the way through host and videocore.
-++**       - Allocated in a cached fashion on host OR videocore.
-++**       - Allocated in a cached fashion on host AND videocore.
-++**
-++**   It is an application decision to determine how to allocate a block.  Evidently
-++**   if the application will be doing substantial read/write accesses to a given block,
-++**   it is recommended to allocate the block at least in a 'host cached' fashion for
-++**   better results.
-++**
-++**
-++** Locking memory blocks.
-++**
-++**   When the memory block has been allocated in a host cached fashion, locking the
-++**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-++**
-++**   For the above reason and when using host cached allocation, it is important that
-++**   an application properly implements the lock/unlock mechanism to ensure cache will
-++**   stay coherent, otherwise there is no guarantee it will at all be.
-++**
-++**   It is possible to dynamically change the host cache behavior (ie cached or non
-++**   cached) of a given allocation without needing to free and re-allocate the block.
-++**   This feature can be useful for such application which requires access to the block
-++**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-++**   the application can optimize performances for a given duration of use.
-++**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-++**   cache.  If one requires to change the videocore cache behavior, then a new block
-++**   must be created to replace the old one.
-++**
-++**   On successful locking, a valid pointer is returned that the application can use
-++**   to access to data inside the block.  There is no guarantee that the pointer will
-++**   stay valid following the unlock action corresponding to this lock.
-++**
-++**
-++** Unocking memory blocks.
-++**
-++**   When the memory block has been allocated in a host cached fashion, unlocking the
-++**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-++**   explicitely asked not to flush the cache for performances reasons.
-++**
-++**   For the above reason and when using host cached allocation, it is important that
-++**   an application properly implements the lock/unlock mechanism to ensure cache will
-++**   stay coherent, otherwise there is no guarantee it will at all be.
-++**
-++**
-++** A complete API is defined below.
-++*/
-++
-++#ifdef __cplusplus
-++extern "C"
-++{
-++#endif
-++
-++/* Different status that can be dumped.
-++*/
-++typedef enum
-++{
-++   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-++                                    // Result of the walk is seen in the videocore
-++                                    // log.
-++   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-++                                    // driver (ie for all processes).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-++                                    // driver (for current process).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-++                                    // driver (for current process).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-++                                    // VCSM_STATUS_HOST_WALK_MAP.
-++                                    //
-++   VCSM_STATUS_NONE,                // Must be last - invalid.
-++
-++} VCSM_STATUS_T;
-++
-++/* Different kind of cache behavior.
-++*/
-++typedef enum
-++{
-++   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-++   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-++   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-++   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-++
-++} VCSM_CACHE_TYPE_T;
-++
-++/* Initialize the vcsm processing.
-++**
-++** Must be called once before attempting to do anything else.
-++**
-++** Returns 0 on success, -1 on error.
-++*/
-++int vcsm_init( void );
-++
-++
-++/* Terminates the vcsm processing.
-++**
-++** Must be called vcsm services are no longer needed, it will
-++** take care of removing any allocation under the current process
-++** control if deemed necessary.
-++*/
-++void vcsm_exit( void );
-++
-++
-++/* Queries the status of the the vcsm.
-++**
-++** Triggers dump of various kind of information, see the
-++** different variants specified in VCSM_STATUS_T.
-++**
-++** Pid is optional.
-++*/
-++void vcsm_status( VCSM_STATUS_T status, int pid );
-++
-++
-++/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-++** allocator.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc( unsigned int size, char *name );
-++
-++
-++/* Allocates a cached block of memory of size 'size' via the vcsm memory
-++** allocator, the type of caching requested is passed as argument of the
-++** function call.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-++
-++
-++/* Shares an allocated block of memory via the vcsm memory allocator.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc_share( unsigned int handle );
-++
-++
-++/* Resizes a block of memory allocated previously by vcsm_alloc.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** The handle must be unlocked by user prior to attempting any
-++** resize action.
-++**
-++** On error, the original size allocated against the handle
-++** remains available the same way it would be following a
-++** successful vcsm_malloc.
-++*/
-++int vcsm_resize( unsigned int handle, unsigned int new_size );
-++
-++
-++/* Frees a block of memory that was successfully allocated by
-++** a prior call the vcms_alloc.
-++**
-++** The handle should be considered invalid upon return from this
-++** call.
-++**
-++** Whether any memory is actually freed up or not as the result of
-++** this call will depends on many factors, if all goes well it will
-++** be freed.  If something goes wrong, the memory will likely end up
-++** being freed up as part of the vcsm_exit process.  In the end the
-++** memory is guaranteed to be freed one way or another.
-++*/
-++void vcsm_free( unsigned int handle );
-++
-++
-++/* Retrieves a videocore opaque handle from a mapped user address
-++** pointer.  The videocore handle will correspond to the actual
-++** memory mapped in videocore.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** Note: the videocore opaque handle is distinct from the user
-++**       opaque handle (allocated via vcsm_malloc) and it is only
-++**       significant for such application which knows what to do
-++**       with it, for the others it is just a number with little
-++**       use since nothing can be done with it (in particular
-++**       for safety reason it cannot be used to map anything).
-++*/
-++unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-++
-++
-++/* Retrieves a videocore opaque handle from a opaque handle
-++** pointer.  The videocore handle will correspond to the actual
-++** memory mapped in videocore.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** Note: the videocore opaque handle is distinct from the user
-++**       opaque handle (allocated via vcsm_malloc) and it is only
-++**       significant for such application which knows what to do
-++**       with it, for the others it is just a number with little
-++**       use since nothing can be done with it (in particular
-++**       for safety reason it cannot be used to map anything).
-++*/
-++unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-++
-++
-++/* Retrieves a user opaque handle from a mapped user address
-++** pointer.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++*/
-++unsigned int vcsm_usr_handle( void *usr_ptr );
-++
-++
-++/* Retrieves a mapped user address from an opaque user
-++** handle.
-++**
-++** Returns:        0 on error
-++**                 a non-zero address on success.
-++**
-++** On success, the address corresponds to the pointer
-++** which can access the data allocated via the vcsm_malloc
-++** call.
-++*/
-++void *vcsm_usr_address( unsigned int handle );
-++
-++
-++/* Locks the memory associated with this opaque handle.
-++**
-++** Returns:        NULL on error
-++**                 a valid pointer on success.
-++**
-++** A user MUST lock the handle received from vcsm_malloc
-++** in order to be able to use the memory associated with it.
-++**
-++** On success, the pointer returned is only valid within
-++** the lock content (ie until a corresponding vcsm_unlock_xx
-++** is invoked).
-++*/
-++void *vcsm_lock( unsigned int handle );
-++
-++
-++/* Locks the memory associated with this opaque handle.  The lock
-++** also gives a chance to update the *host* cache behavior of the
-++** allocated buffer if so desired.  The *videocore* cache behavior
-++** of the allocated buffer cannot be changed by this call and such
-++** attempt will be ignored.
-++**
-++** The system will attempt to honour the cache_update mode request,
-++** the cache_result mode will provide the final answer on which cache
-++** mode is really in use.  Failing to change the cache mode will not
-++** result in a failure to lock the buffer as it is an application
-++** decision to choose what to do if (cache_result != cache_update)
-++**
-++** The value returned in cache_result can only be considered valid if
-++** the returned pointer is non NULL.  The cache_result pointer may be
-++** NULL if the application does not care about the actual outcome of
-++** its action with regards to the cache behavior change.
-++**
-++** Returns:        NULL on error
-++**                 a valid pointer on success.
-++**
-++** A user MUST lock the handle received from vcsm_malloc
-++** in order to be able to use the memory associated with it.
-++**
-++** On success, the pointer returned is only valid within
-++** the lock content (ie until a corresponding vcsm_unlock_xx
-++** is invoked).
-++*/
-++void *vcsm_lock_cache( unsigned int handle,
-++                       VCSM_CACHE_TYPE_T cache_update,
-++                       VCSM_CACHE_TYPE_T *cache_result );
-++
-++
-++/* Unlocks the memory associated with this user mapped address.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking a mapped address, the user should no longer
-++** attempt to reference it.
-++*/
-++int vcsm_unlock_ptr( void *usr_ptr );
-++
-++
-++/* Unlocks the memory associated with this user mapped address.
-++** Apply special processing that would override the otherwise
-++** default behavior.
-++**
-++** If 'cache_no_flush' is specified:
-++**    Do not flush cache as the result of the unlock (if cache
-++**    flush was otherwise applicable in this case).
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking a mapped address, the user should no longer
-++** attempt to reference it.
-++*/
-++int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-++
-++
-++/* Unlocks the memory associated with this user opaque handle.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking an opaque handle, the user should no longer
-++** attempt to reference the mapped addressed once associated
-++** with it.
-++*/
-++int vcsm_unlock_hdl( unsigned int handle );
-++
-++
-++/* Unlocks the memory associated with this user opaque handle.
-++** Apply special processing that would override the otherwise
-++** default behavior.
-++**
-++** If 'cache_no_flush' is specified:
-++**    Do not flush cache as the result of the unlock (if cache
-++**    flush was otherwise applicable in this case).
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking an opaque handle, the user should no longer
-++** attempt to reference the mapped addressed once associated
-++** with it.
-++*/
-++int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-++
-++#ifdef __cplusplus
-++}
-++#endif
-++
-++#endif /* __USER_VCSM__H__INCLUDED__ */
-+-- 
-+2.5.0
-+
-+
-+From 603cf327694d2f986538f13e6b8a1d92b2a9e0b2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@argondesign.com>
-+Date: Sat, 2 May 2015 21:15:37 +0100
-+Subject: [PATCH 04/68] First working version with uncached memory
-+
-+---
-+ libavcodec/hevc.c               |  61 +++++-
-+ libavcodec/hevc.h               |  12 +-
-+ libavcodec/hevc_cabac.c         |  39 +++-
-+ libavcodec/hevc_filter.c        |  16 ++
-+ libavcodec/hevcpred_template.c  |   6 +
-+ libavcodec/rpi_hevc_transform.h | 422 +++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s | 153 +++++++++++++--
-+ libavcodec/rpi_qpu.c            |  72 +++++++
-+ libavcodec/rpi_qpu.h            |   1 +
-+ 9 files changed, 736 insertions(+), 46 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index e58a3d0..4aacb60 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -43,6 +43,8 @@
-+ #include "rpi_qpu.h"
-+ #endif
-+ 
-++// #define DISABLE_MC
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ /**
-+@@ -1066,11 +1068,15 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-++                        printf("Cross component not supported\n"); // TODO
-++                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+ 
-+             if (lc->tu.cross_pf) {
-++                printf("Cross component not supported\n"); // TODO
-++                exit(-1);
-+                 hls_cross_component_pred(s, 1);
-+             }
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+@@ -1099,6 +1105,8 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-++                        printf("Cross component not supported\n"); // TODO
-++                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+@@ -1396,6 +1404,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+     int idx              = ff_hevc_pel_weight[block_w];
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     x_off += mv->x >> 2;
-+     y_off += mv->y >> 2;
-+     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-+@@ -1466,6 +1478,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
-+     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
-+         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
-+         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
-+@@ -1551,6 +1567,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+     intptr_t _mx         = mx << (1 - hshift);
-+     intptr_t _my         = my << (1 - vshift);
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     x_off += mv->x >> (2 + hshift);
-+     y_off += mv->y >> (2 + vshift);
-+     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-+@@ -1615,6 +1635,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
-+     int hshift = s->ps.sps->hshift[1];
-+     int vshift = s->ps.sps->vshift[1];
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
-+     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
-+     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
-+@@ -2354,6 +2378,22 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ }
-+ 
-+ #ifdef RPI
-++static void rpi_execute_transform(HEVCContext *s)
-++{
-++    int i=2;
-++    //int j;
-++    //int16_t *coeffs = s->coeffs_buf_arm[i];
-++    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-++    //    s->hevcdsp.idct[4-2](coeffs, 16);
-++    //}
-++
-++    //gpu_cache_flush(&s->coeffs_buf[i]);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-++
-++    for(i=0;i<4;i++)
-++        s->num_coeffs[i] = 0;
-++}
-++
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+@@ -2374,7 +2414,6 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+-  s->num_coeffs = 0;
-+ }
-+ #endif
-+ 
-+@@ -2421,7 +2460,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (x_ctb + ctb_size >= s->ps.sps->width) {
-++        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-++            rpi_execute_transform(s);
-+             rpi_execute_pred_cmds(s);
-+         }
-+ #endif
-+@@ -3102,7 +3142,9 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_mv_cmds);
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+-    av_freep(&s->coeffs_buf);
-++    for(i = 0; i < 4; i++) {
-++        gpu_free(&s->coeffs_buf[i]);
-++    }
-+ #endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+@@ -3174,13 +3216,16 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+-    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
-+-    if (!s->coeffs_buf)
-+-        goto fail;
-++    for(i = 0; i < 4; i++) {
-++        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-++        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-++        if (!s->coeffs_buf_arm[i])
-++            goto fail;
-++    }
-+     s->enable_rpi = 0;
-+ 
-+     // A little test program
-+-    {
-++    /*{
-+       GPU_MEM_PTR_T p;
-+       int err = gpu_malloc_cached(16, &p);
-+       short *q = (short *)p.arm;
-+@@ -3201,7 +3246,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+       printf(")\n");
-+       gpu_free(&p);
-+       goto fail; // Early out
-+-    }
-++    }*/
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index aa66b00..f201817 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -39,6 +39,11 @@
-+ #include "thread.h"
-+ #include "videodsp.h"
-+ 
-++// define RPI to split the CABAC/prediction/transform into separate stages
-++#ifdef RPI
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+ #define MAX_REFS 16
-+ 
-+@@ -882,11 +887,12 @@ typedef struct HEVCContext {
-+     HEVCMvCmd *unif_mv_cmds;
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+-    int16_t *coeffs_buf;
-+-    int num_mv_cmds;
-++    GPU_MEM_PTR_T coeffs_buf[4];
-++    int16_t *coeffs_buf_arm[4];
-++    int num_coeffs[4];
-+     int num_xfm_cmds;
-++    int num_mv_cmds;
-+     int num_pred_cmds;
-+-    int num_coeffs;
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index c0fdfad..a7561bd 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1031,6 +1031,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     int vshift = s->ps.sps->vshift[c_idx];
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-+     int explicit_rdpcm_flag = 0;
-+@@ -1044,6 +1045,18 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     uint8_t dc_scale;
-+     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-+                                          lc->tu.intra_pred_mode_c;
-++#ifdef RPI
-++    if (s->enable_rpi) {
-++        int n = trafo_size * trafo_size;
-++        if (use_vpu) {
-++            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
-++            s->num_coeffs[log2_trafo_size - 2] += n;
-++        } else {
-++            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
-++            s->num_coeffs[0] += n;
-++        }
-++    }
-++#endif
-+ 
-+     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ 
-+@@ -1488,6 +1501,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+             s->hevcdsp.idct_4x4_luma(coeffs);
-+         } else {
-++#ifdef RPI
-++            if (!use_vpu) {
-++              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-++              if (max_xy == 0)
-++                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-++              else {
-++                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-++                  if (max_xy < 4)
-++                      col_limit = FFMIN(4, col_limit);
-++                  else if (max_xy < 8)
-++                      col_limit = FFMIN(8, col_limit);
-++                  else if (max_xy < 12)
-++                      col_limit = FFMIN(24, col_limit);
-++
-++                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-++              }
-++            }
-++#else
-+             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+             if (max_xy == 0)
-+                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-+@@ -1501,6 +1532,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                     col_limit = FFMIN(24, col_limit);
-+                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-+             }
-++#endif
-+         }
-+     }
-+     if (lc->tu.cross_pf) {
-+@@ -1512,14 +1544,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     }
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+-        int16_t *c = s->coeffs_buf + s->num_coeffs;
-+-        int n = trafo_size * trafo_size;
-+         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+-        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
-+-        s->num_coeffs += n;
-++        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+-        cmd->buf = c;
-++        cmd->buf = coeffs;
-+         cmd->dst = dst;
-+         cmd->stride = stride;
-+         return;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 1f33b0c..e4c3da7 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -22,6 +22,10 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++//#define DISABLE_SAO
-++//#define DISABLE_DEBLOCK
-++//#define DISABLE_STRENGTHS
-++
-+ #include "libavutil/common.h"
-+ #include "libavutil/internal.h"
-+ 
-+@@ -273,6 +277,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
-+     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
-+     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-+ 
-++#ifdef DISABLE_SAO
-++    return;
-++#endif
-++
-+     if (restore) {
-+         if (!edges[0]) {
-+             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-+@@ -496,6 +504,10 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-++#ifdef DISABLE_DEBLOCK
-++    return;
-++#endif
-++
-+     if (x0) {
-+         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-+         left_beta_offset = s->deblock[ctb - 1].beta_offset;
-+@@ -726,6 +738,10 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+     int boundary_upper, boundary_left;
-+     int i, j, bs;
-+ 
-++#ifdef DISABLE_STRENGTHS
-++    return;
-++#endif
-++
-+     boundary_upper = y0 > 0 && !(y0 & 7);
-+     if (boundary_upper &&
-+         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 6ae87cc..71c6d52 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -20,6 +20,8 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++//#define DISABLE_INTRA
-++
-+ #include "libavutil/pixdesc.h"
-+ 
-+ #include "bit_depth_template.c"
-+@@ -114,6 +116,10 @@ do {                                  \
-+     int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
-+                            (x0 + size_in_luma_h)) >> hshift;
-+ 
-++#ifdef DISABLE_INTRA
-++    return;
-++#endif
-++
-+     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-+         int size_in_luma_pu_v = PU(size_in_luma_v);
-+         int size_in_luma_pu_h = PU(size_in_luma_h);
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 85a9102..c0c279f 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,11 +3,11 @@ unsigned char rpi_hevc_transform [] = {
-+ 3,
-+ 3,
-+ 232,
-+-128,
-++32,
-+ 0,
-+ 0,
-+ 0,
-+-20,
-++12,
-+ 248,
-+ 0,
-+ 136,
-+@@ -56,9 +56,9 @@ unsigned char rpi_hevc_transform [] = {
-+ 5,
-+ 232,
-+ 0,
-+-0,
-+ 8,
-+ 0,
-++0,
-+ 128,
-+ 69,
-+ 113,
-+@@ -108,8 +108,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 128,
-+ 2,
-+ 0,
-+-248,
-+-62,
-++8,
-++2,
-+ 0,
-+ 128,
-+ 144,
-+@@ -123,13 +123,13 @@ unsigned char rpi_hevc_transform [] = {
-+ 3,
-+ 32,
-+ 8,
-+-16,
-++20,
-+ 0,
-+ 76,
-+ 254,
-+ 48,
-+ 192,
-+-9,
-++4,
-+ 4,
-+ 32,
-+ 8,
-+@@ -155,14 +155,46 @@ unsigned char rpi_hevc_transform [] = {
-+ 192,
-+ 41,
-+ 3,
-+-68,
-++70,
-++192,
-++80,
-++7,
-++164,
-++255,
-++36,
-++204,
-++96,
-++2,
-++0,
-++248,
-++62,
-++0,
-++3,
-++255,
-++55,
-++208,
-++120,
-++3,
-++224,
-++3,
-++190,
-++11,
-++16,
-++139,
-++246,
-++91,
-++0,
-++103,
-++90,
-++0,
-++70,
-+ 192,
-+ 80,
-+ 7,
-+ 164,
-+ 255,
-+ 36,
-+-220,
-++204,
-+ 96,
-+ 2,
-+ 0,
-+@@ -182,7 +214,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 16,
-+ 139,
-+ 246,
-+-83,
-++91,
-+ 0,
-+ 103,
-+ 90,
-+@@ -209,4 +241,374 @@ unsigned char rpi_hevc_transform [] = {
-+ 96,
-+ 90,
-+ 0,
-++169,
-++3,
-++3,
-++232,
-++32,
-++0,
-++0,
-++0,
-++12,
-++248,
-++0,
-++136,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++248,
-++0,
-++168,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++3,
-++232,
-++128,
-++0,
-++0,
-++0,
-++7,
-++232,
-++0,
-++2,
-++0,
-++0,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++5,
-++232,
-++0,
-++8,
-++0,
-++0,
-++57,
-++239,
-++224,
-++247,
-++255,
-++255,
-++72,
-++192,
-++95,
-++207,
-++88,
-++122,
-++88,
-++124,
-++137,
-++64,
-++26,
-++64,
-++161,
-++64,
-++152,
-++64,
-++128,
-++144,
-++31,
-++0,
-++72,
-++232,
-++32,
-++0,
-++0,
-++0,
-++65,
-++232,
-++32,
-++0,
-++0,
-++0,
-++128,
-++144,
-++23,
-++0,
-++145,
-++64,
-++168,
-++64,
-++128,
-++144,
-++19,
-++0,
-++72,
-++232,
-++32,
-++0,
-++0,
-++0,
-++65,
-++232,
-++32,
-++0,
-++0,
-++0,
-++128,
-++144,
-++11,
-++0,
-++74,
-++232,
-++0,
-++8,
-++0,
-++0,
-++242,
-++140,
-++229,
-++192,
-++57,
-++239,
-++32,
-++8,
-++0,
-++0,
-++41,
-++3,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++12,
-++248,
-++0,
-++132,
-++64,
-++0,
-++192,
-++8,
-++4,
-++0,
-++0,
-++96,
-++255,
-++159,
-++131,
-++255,
-++0,
-++232,
-++0,
-++4,
-++0,
-++0,
-++255,
-++159,
-++142,
-++255,
-++4,
-++255,
-++48,
-++204,
-++16,
-++3,
-++224,
-++251,
-++62,
-++0,
-++5,
-++255,
-++51,
-++204,
-++128,
-++3,
-++224,
-++251,
-++16,
-++0,
-++77,
-++254,
-++51,
-++204,
-++9,
-++4,
-++224,
-++251,
-++0,
-++0,
-++128,
-++64,
-++6,
-++232,
-++64,
-++0,
-++0,
-++0,
-++140,
-++248,
-++47,
-++0,
-++0,
-++0,
-++224,
-++99,
-++0,
-++0,
-++4,
-++254,
-++0,
-++144,
-++128,
-++2,
-++0,
-++8,
-++2,
-++0,
-++32,
-++247,
-++240,
-++207,
-++16,
-++3,
-++32,
-++247,
-++176,
-++207,
-++17,
-++3,
-++32,
-++247,
-++112,
-++207,
-++18,
-++3,
-++32,
-++247,
-++48,
-++207,
-++19,
-++3,
-++32,
-++247,
-++240,
-++206,
-++20,
-++3,
-++32,
-++247,
-++176,
-++206,
-++21,
-++3,
-++32,
-++247,
-++112,
-++206,
-++22,
-++3,
-++32,
-++247,
-++48,
-++206,
-++23,
-++3,
-++32,
-++247,
-++240,
-++205,
-++24,
-++3,
-++32,
-++247,
-++176,
-++205,
-++25,
-++3,
-++32,
-++247,
-++112,
-++205,
-++26,
-++3,
-++32,
-++247,
-++48,
-++205,
-++27,
-++3,
-++32,
-++247,
-++240,
-++204,
-++28,
-++3,
-++32,
-++247,
-++176,
-++204,
-++29,
-++3,
-++32,
-++247,
-++112,
-++204,
-++30,
-++3,
-++32,
-++247,
-++48,
-++204,
-++31,
-++3,
-++5,
-++255,
-++51,
-++204,
-++128,
-++3,
-++224,
-++251,
-++16,
-++0,
-++77,
-++254,
-++51,
-++204,
-++9,
-++4,
-++224,
-++251,
-++0,
-++0,
-++0,
-++237,
-++0,
-++4,
-++0,
-++0,
-++140,
-++248,
-++47,
-++0,
-++0,
-++0,
-++224,
-++99,
-++0,
-++0,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index 5e2728d..1e389c7 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -58,13 +58,6 @@
-+ #
-+ #
-+ 
-+-test_add:
-+-  vldh HX(0,0),(r0)
-+-  vadd HX(0,0),HX(0,0),10
-+-  vsth HX(0,0),(r0)
-+-  mov r0,7 # return value
-+-  b lr
-+-
-+ # Columns are transformed first
-+ #
-+ # Store top left half of transMatrix2 in
-+@@ -79,7 +72,7 @@ test_add:
-+ #
-+ 
-+ 
-+-# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+ # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+ # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+ # num: number of 16x16 transforms to be done
-+@@ -87,17 +80,17 @@ test_add:
-+ hevc_trans_16x16:
-+   push r6-r15, lr # TODO cut down number of used registers
-+ 
-+-  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
-+-  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++  mov r3, 16*2 # Stride of transMatrix2 in bytes
-++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+   # Now use r0 to describe which matrix we are working on.
-+   # Allows us to prefetch the next block of coefficients for efficiency.
-+   mov r0,0 # This describes the location where we read our coefficients from
-+-  mov r3,16*2 # Stride of coefficients in bytes
-++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
-+   mov r7,16*16*2 # Total block size
-+   mov r8,64*16 # Value used to swap from current to next VRF location
-+   vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+   mov r4,64 # Constant used for rounding first pass
-+-  mov r5,1<<19 # Constant used for rounding second pass
-++  mov r5,1<<11 # Constant used for rounding second pass
-+ 
-+   # At start of block r0,r1 point to the current block (that has already been loaded)
-+ block_loop:
-+@@ -113,12 +106,12 @@ block_loop:
-+   vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+   #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+   vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+-  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
-+ 
-+   bl col_trans_16
-+-  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+-  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+-  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
-+ 
-+   # Save results - note there has been a transposition during the processing so we save columns
-+   vsth VX(0,32++)+r0, (r1 += r3) REP 16
-+@@ -132,16 +125,136 @@ block_loop:
-+ 
-+ # r1,r2,r3 r7,r8 should be preserved
-+ # HX(0++,0)+r0 is the block to be transformed
-+-# HX(32++,0) is the 16x16 matrix of transform coefficients
-++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
-+ # Use HY(48,0) for intermediate results
-+ # r0 can be used, but should be returned to its original value at the end
-+ col_trans_16:
-+-  add r4,r0,16 # Final value for this loop
-++  add r6,r0,16 # Final value for this loop
-+ col_trans_16_loop:
-+   # First compute partial products for a single column
-+-  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+-  addcmpblt r0,1,r4,col_trans_16_loop
-++  addcmpblt r0,1,r6,col_trans_16_loop
-+   sub r0,16  # but r0 back to its original value
-+   b lr
-++
-++col_trans_odd_16:
-++  add r6,r0,16 # Final value for this loop
-++col_trans_odd_16_loop:
-++  # First compute partial products for a single column
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-++  # Then sum up the results and place back
-++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-++  addcmpblt r0,1,r6,col_trans_odd_16_loop
-++  sub r0,16  # but r0 back to its original value
-++  b lr
-++
-++
-++test_add:
-++  vldh HX(0,0),(r0)
-++  vadd HX(0,0),HX(0,0),10
-++  vsth HX(0,0),(r0)
-++  mov r0,7 # return value
-++  b lr
-++
-++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
-++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-++# num: number of 16x16 transforms to be done
-++#
-++hevc_trans_32x32:
-++  push r6-r15, lr # TODO cut down number of used registers
-++
-++  # Fetch transform matrices
-++  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-++  add r0, 16*16*2
-++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++
-++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-++  mov r7, 16*16*2 # Total block size
-++  mov r4, 64 # Constant used for rounding first pass
-++  mov r5, 1<<11 # Constant used for rounding second pass
-++  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-++  # set r8 to 32byte aligned stack pointer
-++  add r8,sp,31
-++  lsr r8,5
-++  lsl r8,5
-++  mov r9,r8  # Backup of the temporary storage
-++  mov r10,r1 # Backup of the coefficient buffer
-++block_loop32:
-++
-++  # COLUMN TRANSFORM
-++  # Transform the first 16 columns
-++  mov r1,r10  # Input Coefficient buffer
-++  mov r8,r9   # Output temporary storage
-++  bl trans32
-++  # Transform the second 16 columns
-++  add r8,32
-++  add r1,32
-++  bl trans32
-++
-++  # ROW TRANSFORM
-++  mov r1,r9  # Input temporary storage
-++  mov r8,r10   # Output Coefficient buffer
-++  bl trans32
-++  # Transform the second 16 columns
-++  add r8,32
-++  add r1,32
-++  bl trans32
-++
-++  add r10, 32*32*2 # move onto next block of coefficients
-++  addcmpbgt r2,-1,0,block_loop32
-++
-++  add sp,sp,32*32*2+32 # Restore stack
-++
-++  pop r6-r15, pc
-++
-++trans32:
-++  # We can no longer afford the VRF space to do prefetching when doing 32x32
-++  # Fetch the even rows
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  # Fetch the odd rows
-++  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-++
-++  # Transform the even rows using even matrix
-++  mov r0, 0 # Even rows
-++  bl col_trans_16
-++
-++  # Now transform the odd rows using odd matrix
-++  mov r0, 64*16 # Odd rows
-++  bl col_trans_odd_16
-++
-++  # Now apply butterfly to compute the first 16 results
-++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-++  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  # 16bit results now in HX(48,32)
-++  mov r0,r8
-++  mov r6,32*2
-++  vsth VX(48,32++),(r0+=r6) REP 16
-++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
-++
-++  # Now apply butterfly to compute the second 16 results (in reverse order)
-++  vsub HY(63,0),HY(0,0),HY(16,0)
-++  vsub HY(62,0),HY(0,0),HY(17,0)
-++  vsub HY(61,0),HY(0,0),HY(18,0)
-++  vsub HY(60,0),HY(0,0),HY(19,0)
-++  vsub HY(59,0),HY(0,0),HY(20,0)
-++  vsub HY(58,0),HY(0,0),HY(21,0)
-++  vsub HY(57,0),HY(0,0),HY(22,0)
-++  vsub HY(56,0),HY(0,0),HY(23,0)
-++  vsub HY(55,0),HY(0,0),HY(24,0)
-++  vsub HY(54,0),HY(0,0),HY(25,0)
-++  vsub HY(53,0),HY(0,0),HY(26,0)
-++  vsub HY(52,0),HY(0,0),HY(27,0)
-++  vsub HY(51,0),HY(0,0),HY(28,0)
-++  vsub HY(50,0),HY(0,0),HY(29,0)
-++  vsub HY(49,0),HY(0,0),HY(30,0)
-++  vsub HY(48,0),HY(0,0),HY(31,0)
-++  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  add r0,r8,16*32*2 # Move to 16th row
-++  vsth VX(48,32++),(r0+=r6) REP 16
-++  b lr
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index b1f50ee..d720546 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -3,6 +3,7 @@
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+ #define RPI_TIME_TOTAL_QPU
-++#define RPI_TIME_TOTAL_VPU
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -48,10 +49,47 @@ typedef int int32_t;
-+ #define QPU_CODE_SIZE 2048
-+ #define VPU_CODE_SIZE 2048
-+ 
-++const short rpi_transMatrix2even[32][16] = { // Even rows first
-++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
-++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
-++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
-++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
-++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
-++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
-++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
-++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
-++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
-++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
-++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
-++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
-++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
-++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
-++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
-++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
-++// Odd rows
-++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
-++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
-++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
-++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
-++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
-++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
-++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
-++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
-++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
-++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
-++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
-++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
-++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
-++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
-++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
-++};
-++
-+ struct GPU
-+ {
-+   unsigned int qpu_code[QPU_CODE_SIZE];
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-++  short transMatrix2even[16*16];
-+   int open_count; // Number of allocated video buffers
-+   unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+@@ -123,6 +161,8 @@ static int gpu_init(volatile struct GPU **gpu) {
-+     assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-++  // And the transform coefficients
-++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
-+ 
-+   return 0;
-+ }
-+@@ -274,11 +314,43 @@ unsigned int vpu_get_fn(void) {
-+   return gpu->vc + offsetof(struct GPU,vpu_code);
-+ }
-+ 
-++unsigned int vpu_get_constants(void) {
-++  if (gpu==NULL) {
-++    gpu_lock();
-++    gpu_unlock();
-++  }
-++  return gpu->vc + offsetof(struct GPU,transMatrix2even);
-++}
-++
-+ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+ {
-+   unsigned r;
-++#ifdef RPI_TIME_TOTAL_VPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++  static long long countr2=0;
-++#endif
-+   gpu_lock();
-++#ifdef RPI_TIME_TOTAL_VPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-+   r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-++#ifdef RPI_TIME_TOTAL_VPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  countr2 += r2;
-++  if ((count&0x7f)==0)
-++    printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+   gpu_unlock();
-+   return r;
-+ }
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 4e3c35c..814fc3c 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -34,6 +34,7 @@ extern unsigned int qpu_get_fn(int num);
-+ 
-+ // VPU specific functions
-+ extern unsigned int vpu_get_fn(void);
-++extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ 
-+ // Simple test of shader code
-+-- 
-+2.5.0
-+
-+
-+From 1f1b223bd911a88726aa2c2f56334b15b421d7fa Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 09:41:23 +0100
-+Subject: [PATCH 05/68] Fixed deblocking
-+
-+---
-+ libavcodec/hevc.c | 20 +++++++++++++++++---
-+ 1 file changed, 17 insertions(+), 3 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 4aacb60..94fdda6 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2387,8 +2387,9 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    //gpu_cache_flush(&s->coeffs_buf[i]);
-++    gpu_cache_flush(&s->coeffs_buf[i]);
-+     vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-++    gpu_cache_flush(&s->coeffs_buf[i]);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2427,6 +2428,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-++    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-+     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+ #endif
-+ 
-+@@ -2460,9 +2462,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-++        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-++            int x;
-++            // Transform all blocks
-+             rpi_execute_transform(s);
-++            // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-++            // Perform deblocking for CTBs in this row
-++            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
-++                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
-++            }
-++            start_ctb_x = 0;
-+         }
-+ #endif
-+         if (more_data < 0) {
-+@@ -2473,6 +2483,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         ctb_addr_ts++;
-+         ff_hevc_save_states(s, ctb_addr_ts);
-++#ifdef RPI
-++        if (s->enable_rpi)
-++            continue;
-++#endif
-+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-+     }
-+ 
-+@@ -3217,7 +3231,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+     for(i = 0; i < 4; i++) {
-+-        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-++        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+         s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-+         if (!s->coeffs_buf_arm[i])
-+             goto fail;
-+-- 
-+2.5.0
-+
-+
-+From a32f8972fedc38dcf887f8f2899e8843efd6324a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 11:32:30 +0100
-+Subject: [PATCH 06/68] Added 32x32 transform
-+
-+---
-+ libavcodec/hevc.c               |   8 +-
-+ libavcodec/hevc_cabac.c         |   4 +-
-+ libavcodec/rpi_hevc_transform.h | 200 +++++++++++++++++-----------------------
-+ libavcodec/rpi_hevc_transform.s | 102 ++++++++++----------
-+ libavcodec/rpi_qpu.c            |   4 +-
-+ 5 files changed, 148 insertions(+), 170 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 94fdda6..fbbd30f 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2387,9 +2387,11 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf[i]);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-+-    gpu_cache_flush(&s->coeffs_buf[i]);
-++    gpu_cache_flush(&s->coeffs_buf[2]);
-++    gpu_cache_flush(&s->coeffs_buf[3]);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
-++    gpu_cache_flush(&s->coeffs_buf[2]);
-++    gpu_cache_flush(&s->coeffs_buf[3]);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index a7561bd..3e6dabf 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1031,7 +1031,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     int vshift = s->ps.sps->vshift[c_idx];
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+-    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-++#ifdef RPI
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-++#endif
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-+     int explicit_rdpcm_flag = 0;
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index c0c279f..6d772d7 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -1,6 +1,10 @@
-+ unsigned char rpi_hevc_transform [] = {
-+ 169,
-+ 3,
-++62,
-++64,
-++79,
-++64,
-+ 3,
-+ 232,
-+ 32,
-+@@ -17,6 +21,22 @@ unsigned char rpi_hevc_transform [] = {
-+ 248,
-+ 0,
-+ 0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++248,
-++0,
-++168,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-+ 0,
-+ 96,
-+ 3,
-+@@ -79,7 +99,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 70,
-+ 128,
-+ 144,
-+-39,
-++40,
-+ 0,
-+ 4,
-+ 255,
-+@@ -113,7 +133,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 128,
-+ 144,
-+-22,
-++23,
-+ 0,
-+ 4,
-+ 255,
-+@@ -153,6 +173,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 140,
-+ 211,
-+ 192,
-++34,
-++31,
-+ 41,
-+ 3,
-+ 70,
-+@@ -195,7 +217,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 255,
-+ 36,
-+ 204,
-+-96,
-++224,
-+ 2,
-+ 0,
-+ 248,
-+@@ -219,62 +241,10 @@ unsigned char rpi_hevc_transform [] = {
-+ 103,
-+ 90,
-+ 0,
-+-8,
-+-240,
-+-0,
-+-128,
-+-128,
-+-3,
-+-0,
-+-247,
-+-32,
-+-128,
-+-10,
-+-4,
-+-136,
-+-240,
-+-32,
-+-0,
-+-128,
-+-3,
-+-112,
-+-96,
-+-90,
-+-0,
-+-169,
-+-3,
-+-3,
-+-232,
-+-32,
-+-0,
-+-0,
-+-0,
-+-12,
-+-248,
-+-0,
-+-136,
-+-0,
-+-0,
-+-192,
-+-248,
-+-0,
-+-0,
-++225,
-++64,
-++242,
-+ 64,
-+-232,
-+-0,
-+-2,
-+-0,
-+-0,
-+-12,
-+-248,
-+-0,
-+-168,
-+-0,
-+-0,
-+-192,
-+-248,
-+-0,
-+-0,
-+ 3,
-+ 232,
-+ 128,
-+@@ -287,18 +257,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 2,
-+ 0,
-+ 0,
-+-4,
-+-232,
-+-64,
-+-0,
-+-0,
-+-0,
-+-5,
-+-232,
-+-0,
-+-8,
-+-0,
-+-0,
-+ 57,
-+ 239,
-+ 224,
-+@@ -317,18 +275,26 @@ unsigned char rpi_hevc_transform [] = {
-+ 64,
-+ 26,
-+ 64,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++149,
-++96,
-+ 161,
-+ 64,
-+ 152,
-+ 64,
-+ 128,
-+ 144,
-+-31,
-++35,
-+ 0,
-+ 72,
-+ 232,
-+-32,
-+ 0,
-++4,
-+ 0,
-+ 0,
-+ 65,
-+@@ -339,8 +305,16 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 128,
-+ 144,
-+-23,
-++27,
-++0,
-++4,
-++232,
-++0,
-++8,
-+ 0,
-++0,
-++69,
-++96,
-+ 145,
-+ 64,
-+ 168,
-+@@ -351,8 +325,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 72,
-+ 232,
-+-32,
-+ 0,
-++4,
-+ 0,
-+ 0,
-+ 65,
-+@@ -373,7 +347,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 242,
-+ 140,
-+-229,
-++221,
-+ 192,
-+ 57,
-+ 239,
-+@@ -383,6 +357,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 41,
-+ 3,
-++239,
-++3,
-+ 12,
-+ 248,
-+ 0,
-+@@ -390,7 +366,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 0,
-+ 192,
-+-8,
-++248,
-+ 4,
-+ 0,
-+ 12,
-+@@ -400,14 +376,14 @@ unsigned char rpi_hevc_transform [] = {
-+ 64,
-+ 0,
-+ 192,
-+-8,
-++248,
-+ 4,
-+ 0,
-+ 0,
-+ 96,
-+ 255,
-+ 159,
-+-131,
-++154,
-+ 255,
-+ 0,
-+ 232,
-+@@ -417,7 +393,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 255,
-+ 159,
-+-142,
-++165,
-+ 255,
-+ 4,
-+ 255,
-+@@ -429,7 +405,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 62,
-+ 0,
-+-5,
-++4,
-+ 255,
-+ 51,
-+ 204,
-+@@ -439,15 +415,15 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 16,
-+ 0,
-+-77,
-++76,
-+ 254,
-+ 51,
-+ 204,
-+-9,
-+-4,
-++128,
-++3,
-+ 224,
-+ 251,
-+-0,
-++20,
-+ 0,
-+ 128,
-+ 64,
-+@@ -467,16 +443,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 99,
-+ 0,
-+ 0,
-+-4,
-+-254,
-+-0,
-+-144,
-+-128,
-+-2,
-+-0,
-+-8,
-+-2,
-+-0,
-+ 32,
-+ 247,
-+ 240,
-+@@ -488,92 +454,92 @@ unsigned char rpi_hevc_transform [] = {
-+ 176,
-+ 207,
-+ 17,
-+-3,
-++19,
-+ 32,
-+ 247,
-+ 112,
-+ 207,
-+ 18,
-+-3,
-++35,
-+ 32,
-+ 247,
-+ 48,
-+ 207,
-+ 19,
-+-3,
-++51,
-+ 32,
-+ 247,
-+ 240,
-+ 206,
-+ 20,
-+-3,
-++67,
-+ 32,
-+ 247,
-+ 176,
-+ 206,
-+ 21,
-+-3,
-++83,
-+ 32,
-+ 247,
-+ 112,
-+ 206,
-+ 22,
-+-3,
-++99,
-+ 32,
-+ 247,
-+ 48,
-+ 206,
-+ 23,
-+-3,
-++115,
-+ 32,
-+ 247,
-+ 240,
-+ 205,
-+ 24,
-+-3,
-++131,
-+ 32,
-+ 247,
-+ 176,
-+ 205,
-+ 25,
-+-3,
-++147,
-+ 32,
-+ 247,
-+ 112,
-+ 205,
-+ 26,
-+-3,
-++163,
-+ 32,
-+ 247,
-+ 48,
-+ 205,
-+ 27,
-+-3,
-++179,
-+ 32,
-+ 247,
-+ 240,
-+ 204,
-+ 28,
-+-3,
-++195,
-+ 32,
-+ 247,
-+ 176,
-+ 204,
-+ 29,
-+-3,
-++211,
-+ 32,
-+ 247,
-+ 112,
-+ 204,
-+ 30,
-+-3,
-++227,
-+ 32,
-+ 247,
-+ 48,
-+ 204,
-+ 31,
-+-3,
-+-5,
-++243,
-++4,
-+ 255,
-+ 51,
-+ 204,
-+@@ -583,20 +549,20 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 16,
-+ 0,
-+-77,
-++76,
-+ 254,
-+ 51,
-+ 204,
-+-9,
-+-4,
-++128,
-++3,
-+ 224,
-+ 251,
-+-0,
-++20,
-+ 0,
-+ 0,
-+ 237,
-++32,
-+ 0,
-+-4,
-+ 0,
-+ 0,
-+ 140,
-+@@ -609,6 +575,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 99,
-+ 0,
-+ 0,
-+-90,
-+-0,
-++111,
-++3,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index 1e389c7..afdb32a 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -76,12 +76,19 @@
-+ # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+ # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+ # num: number of 16x16 transforms to be done
-++# coeffs32
-++# num32: number of 32x32 transforms
-+ #
-+ hevc_trans_16x16:
-+   push r6-r15, lr # TODO cut down number of used registers
-+-
-++  mov r14,r3 # coeffs32
-++  mov r15,r4 # num32
-+   mov r3, 16*2 # Stride of transMatrix2 in bytes
-+   vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++
-++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++
-+   # Now use r0 to describe which matrix we are working on.
-+   # Allows us to prefetch the next block of coefficients for efficiency.
-+   mov r0,0 # This describes the location where we read our coefficients from
-+@@ -121,6 +128,10 @@ block_loop:
-+   add r1,r7
-+ 
-+   addcmpbgt r2,-1,0,block_loop
-++
-++  # Now go and do any 32x32 transforms
-++  b hevc_trans_32x32
-++
-+   pop r6-r15, pc
-+ 
-+ # r1,r2,r3 r7,r8 should be preserved
-+@@ -136,26 +147,18 @@ col_trans_16_loop:
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+   addcmpblt r0,1,r6,col_trans_16_loop
-+-  sub r0,16  # but r0 back to its original value
-++  sub r0,16  # put r0 back to its original value
-+   b lr
-+ 
-+ col_trans_odd_16:
-+   add r6,r0,16 # Final value for this loop
-+ col_trans_odd_16_loop:
-+   # First compute partial products for a single column
-+-  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+   addcmpblt r0,1,r6,col_trans_odd_16_loop
-+-  sub r0,16  # but r0 back to its original value
-+-  b lr
-+-
-+-
-+-test_add:
-+-  vldh HX(0,0),(r0)
-+-  vadd HX(0,0),HX(0,0),10
-+-  vsth HX(0,0),(r0)
-+-  mov r0,7 # return value
-++  sub r0,16  # put r0 back to its original value
-+   b lr
-+ 
-+ # hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-+@@ -164,18 +167,17 @@ test_add:
-+ # num: number of 16x16 transforms to be done
-+ #
-+ hevc_trans_32x32:
-+-  push r6-r15, lr # TODO cut down number of used registers
-++  mov r1,r14 # coeffs
-++  mov r2,r15 # num
-+ 
-+-  # Fetch transform matrices
-+-  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+-  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+-  add r0, 16*16*2
-+-  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++  # Fetch odd transform matrix
-++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-++  #add r0, 16*16*2
-++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+ 
-+   mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-+   mov r7, 16*16*2 # Total block size
-+-  mov r4, 64 # Constant used for rounding first pass
-+-  mov r5, 1<<11 # Constant used for rounding second pass
-+   sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-+   # set r8 to 32byte aligned stack pointer
-+   add r8,sp,31
-+@@ -186,21 +188,27 @@ hevc_trans_32x32:
-+ block_loop32:
-+ 
-+   # COLUMN TRANSFORM
-++  mov r4, 64 # Constant used for rounding first pass
-++  mov r5, 9 # left shift used for rounding first pass
-++
-+   # Transform the first 16 columns
-+   mov r1,r10  # Input Coefficient buffer
-+   mov r8,r9   # Output temporary storage
-+   bl trans32
-+   # Transform the second 16 columns
-+-  add r8,32
-++  add r8,32*16*2
-+   add r1,32
-+   bl trans32
-+ 
-+   # ROW TRANSFORM
-++  mov r4, 1<<11 # Constant used for rounding second pass
-++  mov r5, 4 # left shift used for rounding second pass
-++
-+   mov r1,r9  # Input temporary storage
-+   mov r8,r10   # Output Coefficient buffer
-+   bl trans32
-+   # Transform the second 16 columns
-+-  add r8,32
-++  add r8,32*16*2
-+   add r1,32
-+   bl trans32
-+ 
-+@@ -212,11 +220,12 @@ block_loop32:
-+   pop r6-r15, pc
-+ 
-+ trans32:
-++  push lr
-+   # We can no longer afford the VRF space to do prefetching when doing 32x32
-+   # Fetch the even rows
-+-  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  vldh HX(0++,0),(r1 += r3) REP 16
-+   # Fetch the odd rows
-+-  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+ 
-+   # Transform the even rows using even matrix
-+   mov r0, 0 # Even rows
-+@@ -228,33 +237,32 @@ trans32:
-+ 
-+   # Now apply butterfly to compute the first 16 results
-+   vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-+-  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+-  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-+   # 16bit results now in HX(48,32)
-+   mov r0,r8
-+   mov r6,32*2
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+-  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
-+ 
-+   # Now apply butterfly to compute the second 16 results (in reverse order)
-+-  vsub HY(63,0),HY(0,0),HY(16,0)
-+-  vsub HY(62,0),HY(0,0),HY(17,0)
-+-  vsub HY(61,0),HY(0,0),HY(18,0)
-+-  vsub HY(60,0),HY(0,0),HY(19,0)
-+-  vsub HY(59,0),HY(0,0),HY(20,0)
-+-  vsub HY(58,0),HY(0,0),HY(21,0)
-+-  vsub HY(57,0),HY(0,0),HY(22,0)
-+-  vsub HY(56,0),HY(0,0),HY(23,0)
-+-  vsub HY(55,0),HY(0,0),HY(24,0)
-+-  vsub HY(54,0),HY(0,0),HY(25,0)
-+-  vsub HY(53,0),HY(0,0),HY(26,0)
-+-  vsub HY(52,0),HY(0,0),HY(27,0)
-+-  vsub HY(51,0),HY(0,0),HY(28,0)
-+-  vsub HY(50,0),HY(0,0),HY(29,0)
-+-  vsub HY(49,0),HY(0,0),HY(30,0)
-+-  vsub HY(48,0),HY(0,0),HY(31,0)
-+-  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+-  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-+-  add r0,r8,16*32*2 # Move to 16th row
-++  vsub HY(63,0),HY(0 ,0),HY(16,0)
-++  vsub HY(62,0),HY(1 ,0),HY(17,0)
-++  vsub HY(61,0),HY(2 ,0),HY(18,0)
-++  vsub HY(60,0),HY(3 ,0),HY(19,0)
-++  vsub HY(59,0),HY(4 ,0),HY(20,0)
-++  vsub HY(58,0),HY(5 ,0),HY(21,0)
-++  vsub HY(57,0),HY(6 ,0),HY(22,0)
-++  vsub HY(56,0),HY(7 ,0),HY(23,0)
-++  vsub HY(55,0),HY(8 ,0),HY(24,0)
-++  vsub HY(54,0),HY(9 ,0),HY(25,0)
-++  vsub HY(53,0),HY(10,0),HY(26,0)
-++  vsub HY(52,0),HY(11,0),HY(27,0)
-++  vsub HY(51,0),HY(12,0),HY(28,0)
-++  vsub HY(50,0),HY(13,0),HY(29,0)
-++  vsub HY(49,0),HY(14,0),HY(30,0)
-++  vsub HY(48,0),HY(15,0),HY(31,0)
-++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-++  add r0,r8,32
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+-  b lr
-++  pop pc
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index d720546..12ad5fb 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -89,7 +89,7 @@ struct GPU
-+ {
-+   unsigned int qpu_code[QPU_CODE_SIZE];
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-+-  short transMatrix2even[16*16];
-++  short transMatrix2even[16*16*2];
-+   int open_count; // Number of allocated video buffers
-+   unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+@@ -162,7 +162,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-+   // And the transform coefficients
-+-  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
-++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+ 
-+   return 0;
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 223fee0c814602a2aa5611c21fe052e6b6e063c1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 16:57:03 +0100
-+Subject: [PATCH 07/68] Clear coefficients in advance
-+
-+---
-+ libavcodec/hevc.c               | 129 ++++++++++++++++++++++++++++------------
-+ libavcodec/hevc.h               |   6 +-
-+ libavcodec/hevc_cabac.c         |   7 ++-
-+ libavcodec/rpi_hevc_transform.h |  50 ++++++++++++++++
-+ libavcodec/rpi_hevc_transform.s |  16 +++++
-+ 5 files changed, 168 insertions(+), 40 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index fbbd30f..12e66a6 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,6 +41,8 @@
-+ 
-+ #ifdef RPI
-+ #include "rpi_qpu.h"
-++// For some unknown reason, the code seems to crash if I do a late malloc
-++#define EARLY_MALLOC
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -59,6 +61,20 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ /* free everything allocated  by pic_arrays_init() */
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-++#ifdef RPI
-++#ifdef EARLY_MALLOC
-++#else
-++    printf("pic_arrays_free\n");
-++    if (s->coeffs_buf_arm[0]) {
-++      gpu_free(&s->coeffs_buf_default);
-++      s->coeffs_buf_arm[0] = 0;
-++    }
-++    if (s->coeffs_buf_arm[2]) {
-++      gpu_free(&s->coeffs_buf_accelerated);
-++      s->coeffs_buf_arm[2] = 0;
-++    }
-++#endif
-++#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -95,6 +111,28 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int ctb_count        = sps->ctb_width * sps->ctb_height;
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-++#ifdef RPI
-++#ifdef EARLY_MALLOC
-++#else
-++    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-++    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-++    printf("pic_arrays_init\n");
-++    printf("Allocated %d\n",coefs_per_row);
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++    if (!s->coeffs_buf_arm[0])
-++        goto fail;
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++    if (!s->coeffs_buf_arm[2])
-++        goto fail;
-++    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++    printf("Done\n");
-++#endif
-++#endif
-++
-+     s->bs_width  = (width  >> 2) + 1;
-+     s->bs_height = (height >> 2) + 1;
-+ 
-+@@ -2387,11 +2425,10 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf[2]);
-+-    gpu_cache_flush(&s->coeffs_buf[3]);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
-+-    gpu_cache_flush(&s->coeffs_buf[2]);
-+-    gpu_cache_flush(&s->coeffs_buf[3]);
-++
-++    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2413,7 +2450,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+       } else {
-++          int trafo_size = 1 << cmd->size;
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-++          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+@@ -3158,10 +3197,18 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_mv_cmds);
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+-    for(i = 0; i < 4; i++) {
-+-        gpu_free(&s->coeffs_buf[i]);
-++
-++#ifdef EARLY_MALLOC
-++    if (s->coeffs_buf_arm[0]) {
-++      gpu_free(&s->coeffs_buf_default);
-++      s->coeffs_buf_arm[0] = 0;
-++    }
-++    if (s->coeffs_buf_arm[2]) {
-++      gpu_free(&s->coeffs_buf_accelerated);
-++      s->coeffs_buf_arm[2] = 0;
-+     }
-+ #endif
-++#endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+         av_freep(&s->sao_pixel_buffer_h[i]);
-+@@ -3209,6 +3256,16 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     return 0;
-+ }
-+ 
-++#ifdef RPI
-++static av_cold void memclear16(int16_t *p, int n)
-++{
-++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-++  //int i;
-++  //for(i=0;i<n;i++)
-++  //  p[i] = 0;
-++}
-++#endif
-++
-+ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+     HEVCContext *s = avctx->priv_data;
-+@@ -3232,37 +3289,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+-    for(i = 0; i < 4; i++) {
-+-        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+-        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-+-        if (!s->coeffs_buf_arm[i])
-+-            goto fail;
-+-    }
-+-    s->enable_rpi = 0;
-+ 
-+-    // A little test program
-+-    /*{
-+-      GPU_MEM_PTR_T p;
-+-      int err = gpu_malloc_cached(16, &p);
-+-      short *q = (short *)p.arm;
-+-      int i;
-+-      int r;
-+-      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
-+-      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
-+-      printf("Preparing data %p\n",q);
-+-      for(i=0;i<16;i++)
-+-        q[i] = i;
-+-      printf("Flush cache\n");
-+-      gpu_cache_flush(&p);
-+-      printf("Executing code\n");
-+-      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
-+-      printf("Return value %d (",r);
-+-      for(i=0;i<16;i++)
-+-        printf("%d ",q[i]);
-+-      printf(")\n");
-+-      gpu_free(&p);
-+-      goto fail; // Early out
-+-    }*/
-++    s->coeffs_buf_arm[0] = 0;
-++    s->coeffs_buf_arm[2] = 0;
-++
-++#ifdef EARLY_MALLOC
-++    int coeffs_in_ctb = 64*64;
-++    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-++    printf("Allocated %d\n",coefs_per_row);
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++    if (!s->coeffs_buf_arm[0])
-++        goto fail;
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++    if (!s->coeffs_buf_arm[2])
-++        goto fail;
-++    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++    printf("Done\n");
-++    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-++    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-++    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-++#endif
-++
-++    s->enable_rpi = 0;
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index f201817..ca7c2aa 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -887,8 +887,12 @@ typedef struct HEVCContext {
-+     HEVCMvCmd *unif_mv_cmds;
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+-    GPU_MEM_PTR_T coeffs_buf[4];
-++    int buf_width;
-++    GPU_MEM_PTR_T coeffs_buf_default;
-++    GPU_MEM_PTR_T coeffs_buf_accelerated;
-+     int16_t *coeffs_buf_arm[4];
-++    unsigned int coeffs_buf_vc[4];
-++
-+     int num_coeffs[4];
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 3e6dabf..a295d3e 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1058,9 +1058,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             s->num_coeffs[0] += n;
-+         }
-+     }
-++    // We now do the memset after transform_add while we know the data is cached.
-++    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++#else
-++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ #endif
-+ 
-+-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++
-+ 
-+     // Derive QP for dequant
-+     if (!lc->cu.cu_transquant_bypass_flag) {
-+@@ -1547,7 +1551,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+-        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+         cmd->buf = coeffs;
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 6d772d7..4f13622 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -1,4 +1,10 @@
-+ unsigned char rpi_hevc_transform [] = {
-++21,
-++106,
-++0,
-++144,
-++35,
-++1,
-+ 169,
-+ 3,
-+ 62,
-+@@ -577,4 +583,48 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 111,
-+ 3,
-++4,
-++254,
-++0,
-++128,
-++0,
-++4,
-++0,
-++248,
-++0,
-++0,
-++2,
-++232,
-++32,
-++0,
-++0,
-++0,
-++140,
-++248,
-++32,
-++0,
-++0,
-++0,
-++224,
-++35,
-++0,
-++0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++193,
-++232,
-++0,
-++1,
-++0,
-++0,
-++1,
-++106,
-++116,
-++30,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index afdb32a..fd159bc 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -78,8 +78,11 @@
-+ # num: number of 16x16 transforms to be done
-+ # coeffs32
-+ # num32: number of 32x32 transforms
-++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
-+ #
-+ hevc_trans_16x16:
-++  cmp r5,1
-++  beq memclear16
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -266,3 +269,16 @@ trans32:
-+   add r0,r8,32
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+   pop pc
-++
-++memclear16:
-++  # r0 is address
-++  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
-++  vmov HX(0++,0),0 REP 16
-++  mov r2,32
-++loop:
-++  vsth HX(0++,0),(r0+=r2) REP 16
-++  add r0,16*16*2
-++  sub r1,16*16
-++  cmp r1,0
-++  bgt loop
-++  b lr
-+-- 
-+2.5.0
-+
-+
-+From dffd0d9fc1ada2b61c61c73cba53538e564ced02 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 09:56:43 +0100
-+Subject: [PATCH 08/68] Prepared inter offload
-+
-+---
-+ libavcodec/hevc.c       | 116 +++++++++++++++++++++++++++++++++++++++++++-----
-+ libavcodec/hevc.h       |  29 +++++++++++-
-+ libavcodec/hevc_cabac.c |   5 ++-
-+ 3 files changed, 137 insertions(+), 13 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 12e66a6..7453b63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -43,6 +43,8 @@
-+ #include "rpi_qpu.h"
-+ // For some unknown reason, the code seems to crash if I do a late malloc
-+ #define EARLY_MALLOC
-++// Move Inter prediction into separate pass
-++//#define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -1427,6 +1429,95 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-+  * @param luma_offset additive offset applied to the luma prediction value
-+  */
-+ 
-++#ifdef RPI_INTER
-++#define RPI_REDIRECT(fn) rpi_ ## fn
-++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
-++                        int block_w, int block_h, int luma_weight, int luma_offset)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_LUMA_UNI;
-++    cmd->dst = dst;
-++    cmd->dststride = dststride;
-++    cmd->src = ref->data[0];
-++    cmd->srcstride = ref->linesize[0];
-++    cmd->mv = *mv;
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = luma_weight;
-++    cmd->offset = luma_offset;
-++}
-++
-++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_LUMA_BI;
-++    cmd->dst = dst;
-++    cmd->dststride = dststride;
-++    cmd->src = ref->data[0];
-++    cmd->srcstride = ref->linesize[0];
-++    cmd->mv = *mv;
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = luma_weight;
-++    cmd->offset = luma_offset;
-++    cmd->src1 = ref1->data[];
-++    cmd->srcstride1 = ref1->linesize[0];
-++    cmd->mv1 = *mv1;
-++    cmd->ref_idx[0] = current_mv->ref_idx[0];
-++    cmd->ref_idx[1] = current_mv->ref_idx[1];
-++}
-++
-++static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-++                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_CHROMA_UNI;
-++    cmd->dst = dst0;
-++    cmd->dststride = dststride;
-++    cmd->src = src0;
-++    cmd->srcstride = srcstride;
-++    cmd->mv = current_mv->mv[reflist];
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = chroma_weight;
-++    cmd->offset = chroma_offset;
-++}
-++
-++static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-++                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-++    cmd->dst = dst0;
-++    cmd->dststride = dststride;
-++    cmd->src = ref0->data[cidx+1];
-++    cmd->srcstride = ref0->linesize[cidx+1];
-++    cmd->mv = current_mv->mv[reflist];
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = chroma_weight;
-++    cmd->offset = chroma_offset;
-++    cmd->src = ref1->data[cidx+1];
-++    cmd->srcstride1 = ref1->linesize[cidx+1];
-++    cmd->ref_idx[0] = current_mv->ref_idx[0];
-++    cmd->ref_idx[1] = current_mv->ref_idx[1];
-++}
-++#else
-++#define RPI_REDIRECT(fn) fn
-++#endif
-++
-+ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+@@ -1492,7 +1583,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+  * @param mv1 motion vector1 (relative to block position) to get pixel data from
-+  * @param current_mv current motion vector structure
-+  */
-+- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+@@ -1874,16 +1965,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
-++        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
-+-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
-+         }
-+@@ -1893,17 +1984,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
-++        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-+                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
-+ 
-+-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
-+         }
-+@@ -1913,15 +2004,15 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
-++        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-++            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
-+ 
-+-            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-++            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
-+         }
-+     }
-+@@ -2452,7 +2543,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+       } else {
-+           int trafo_size = 1 << cmd->size;
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-++#ifdef RPI_PRECLEAR
-+           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-++#endif
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+@@ -3309,6 +3402,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+     s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+     printf("Done\n");
-++#ifdef RPI_PRECLEAR
-+     //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+     memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+     //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+@@ -3317,6 +3411,8 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+ #endif
-+ 
-++#endif
-++
-+     s->enable_rpi = 0;
-+ 
-+ #endif
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index ca7c2aa..8ef6f51 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -829,14 +829,39 @@ typedef struct HEVCLocalContext {
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+ #define RPI_MAX_WIDTH 2048
-+ 
-+-// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
-+-#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-++#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+ #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+ 
-++#define RPI_CMD_LUMA_UNI 0
-++#define RPI_CMD_CHROMA_UNI 1
-++#define RPI_CMD_LUMA_BI 2
-++#define RPI_CMD_U_BI 3
-++#define RPI_CMD_V_BI 4
-++
-++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-++// #define RPI_PRECLEAR
-++
-+ // Command for inter prediction
-+ typedef struct HEVCMvCmd {
-++    int cmd;
-++    uint8_t *dst;
-++    ptrdiff_t dststride;
-++    uint8_t *src;
-++    ptrdiff_t srcstride;
-++    Mv mv;
-++    int x_off;
-++    int y_off;
-++    int block_w;
-++    int block_h;
-++    int weight;
-++    int offset;
-++    uint8_t *src1;
-++    ptrdiff_t srcstride1;
-++    Mv mv1;
-++    int8_t ref_idx[2];
-+ } HEVCMvCmd;
-+ 
-+ // Command for transform to process a block of coefficients
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index a295d3e..f28759b 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1059,7 +1059,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         }
-+     }
-+     // We now do the memset after transform_add while we know the data is cached.
-+-    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++    #ifdef RPI_PRECLEAR
-++    #else
-++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++    #endif
-+ #else
-+     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From fa1aa086848e704e43a90d09ddf35a5e7d99aae2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 11:08:50 +0100
-+Subject: [PATCH 09/68] Inter prediction in separate pass
-+
-+---
-+ libavcodec/hevc.c | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
-+ libavcodec/hevc.h |  2 +-
-+ 2 files changed, 77 insertions(+), 18 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7453b63..83fdb57 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -44,7 +44,7 @@
-+ // For some unknown reason, the code seems to crash if I do a late malloc
-+ #define EARLY_MALLOC
-+ // Move Inter prediction into separate pass
-+-//#define RPI_INTER
-++#define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -1435,7 +1435,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_LUMA_UNI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1454,31 +1454,29 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_LUMA_BI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+-    cmd->src = ref->data[0];
-+-    cmd->srcstride = ref->linesize[0];
-+-    cmd->mv = *mv;
-++    cmd->src = ref0->data[0];
-++    cmd->srcstride = ref0->linesize[0];
-++    cmd->mv = *mv0;
-+     cmd->x_off = x_off;
-+     cmd->y_off = y_off;
-+     cmd->block_w = block_w;
-+     cmd->block_h = block_h;
-+-    cmd->weight = luma_weight;
-+-    cmd->offset = luma_offset;
-+-    cmd->src1 = ref1->data[];
-++    cmd->src1 = ref1->data[0];
-+     cmd->srcstride1 = ref1->linesize[0];
-+     cmd->mv1 = *mv1;
-+     cmd->ref_idx[0] = current_mv->ref_idx[0];
-+     cmd->ref_idx[1] = current_mv->ref_idx[1];
-+ }
-+ 
-+-static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-++static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_CHROMA_UNI;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -1493,27 +1491,27 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+     cmd->offset = chroma_offset;
-+ }
-+ 
-+-static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-++static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+     cmd->src = ref0->data[cidx+1];
-+     cmd->srcstride = ref0->linesize[cidx+1];
-+-    cmd->mv = current_mv->mv[reflist];
-++    cmd->mv = current_mv->mv[0];
-++    cmd->mv1 = current_mv->mv[1];
-+     cmd->x_off = x_off;
-+     cmd->y_off = y_off;
-+     cmd->block_w = block_w;
-+     cmd->block_h = block_h;
-+-    cmd->weight = chroma_weight;
-+-    cmd->offset = chroma_offset;
-+-    cmd->src = ref1->data[cidx+1];
-++    cmd->src1 = ref1->data[cidx+1];
-+     cmd->srcstride1 = ref1->linesize[cidx+1];
-+     cmd->ref_idx[0] = current_mv->ref_idx[0];
-+     cmd->ref_idx[1] = current_mv->ref_idx[1];
-+ }
-++
-+ #else
-+ #define RPI_REDIRECT(fn) fn
-+ #endif
-+@@ -2541,7 +2539,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+       } else {
-++#ifdef RPI_PRECLEAR
-+           int trafo_size = 1 << cmd->size;
-++#endif
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+ #ifdef RPI_PRECLEAR
-+           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+@@ -2550,6 +2550,61 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+   }
-+   s->num_pred_cmds = 0;
-+ }
-++
-++static void rpi_execute_inter_cmds(HEVCContext *s)
-++{
-++    HEVCMvCmd *cmd = s->unif_mv_cmds;
-++    int n,cidx;
-++    AVFrame myref;
-++    AVFrame myref1;
-++    struct MvField mymv;
-++    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
-++        printf("Overflow inter_cmds\n");
-++        exit(-1);
-++    }
-++    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
-++        switch(cmd->cmd) {
-++        case RPI_CMD_LUMA_UNI:
-++            myref.data[0] = cmd->src;
-++            myref.linesize[0] = cmd->srcstride;
-++            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
-++            break;
-++        case RPI_CMD_LUMA_BI:
-++            myref.data[0] = cmd->src;
-++            myref.linesize[0] = cmd->srcstride;
-++            myref1.data[0] = cmd->src1;
-++            myref1.linesize[0] = cmd->srcstride1;
-++            mymv.ref_idx[0] = cmd->ref_idx[0];
-++            mymv.ref_idx[1] = cmd->ref_idx[1];
-++            luma_mc_bi(s, cmd->dst, cmd->dststride,
-++                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
-++                       &myref1, &cmd->mv1, &mymv);
-++            break;
-++        case RPI_CMD_CHROMA_UNI:
-++            mymv.mv[0] = cmd->mv;
-++            chroma_mc_uni(s, cmd->dst,
-++                          cmd->dststride, cmd->src, cmd->srcstride, 0,
-++                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
-++            break;
-++        case RPI_CMD_CHROMA_BI:
-++        case RPI_CMD_CHROMA_BI+1:
-++            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
-++            myref.data[cidx+1] = cmd->src;
-++            myref.linesize[cidx+1] = cmd->srcstride;
-++            myref1.data[cidx+1] = cmd->src1;
-++            myref1.linesize[cidx+1] = cmd->srcstride1;
-++            mymv.ref_idx[0] = cmd->ref_idx[0];
-++            mymv.ref_idx[1] = cmd->ref_idx[1];
-++            mymv.mv[0] = cmd->mv;
-++            mymv.mv[1] = cmd->mv1;
-++            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
-++                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
-++            break;
-++        }
-++    }
-++    s->num_mv_cmds = 0;
-++}
-++
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -2598,6 +2653,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-++            // Perform inter prediction
-++            rpi_execute_inter_cmds(s);
-+             // Transform all blocks
-+             rpi_execute_transform(s);
-+             // Perform intra prediction and residual reconstruction
-+@@ -3350,6 +3407,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ }
-+ 
-+ #ifdef RPI
-++#ifdef RPI_PRECLEAR
-+ static av_cold void memclear16(int16_t *p, int n)
-+ {
-+   vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-+@@ -3358,6 +3416,7 @@ static av_cold void memclear16(int16_t *p, int n)
-+   //  p[i] = 0;
-+ }
-+ #endif
-++#endif
-+ 
-+ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8ef6f51..8115d04 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -838,7 +838,7 @@ typedef struct HEVCLocalContext {
-+ #define RPI_CMD_LUMA_UNI 0
-+ #define RPI_CMD_CHROMA_UNI 1
-+ #define RPI_CMD_LUMA_BI 2
-+-#define RPI_CMD_U_BI 3
-++#define RPI_CMD_CHROMA_BI 3
-+ #define RPI_CMD_V_BI 4
-+ 
-+ // RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-+-- 
-+2.5.0
-+
-+
-+From eba684df008749ec0f5751ea2343198006682a1c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 13:03:50 +0100
-+Subject: [PATCH 10/68] Added VPU thread
-+
-+---
-+ libavcodec/hevc.c    |  11 +++--
-+ libavcodec/hevc.h    |   1 +
-+ libavcodec/rpi_qpu.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.h |   2 +
-+ 4 files changed, 133 insertions(+), 6 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 83fdb57..9b3edf2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2516,8 +2516,10 @@ static void rpi_execute_transform(HEVCContext *s)
-+ 
-+ 
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-++    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    //vpu_wait(s->vpu_id);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2653,10 +2655,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+-            // Perform inter prediction
-+-            rpi_execute_inter_cmds(s);
-+             // Transform all blocks
-+             rpi_execute_transform(s);
-++            // Perform inter prediction
-++            rpi_execute_inter_cmds(s);
-++            // Wait for transform completion
-++            vpu_wait(s->vpu_id);
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+@@ -3349,6 +3353,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->univ_pred_cmds);
-+ 
-+ #ifdef EARLY_MALLOC
-++    printf("hevc_decode_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+       gpu_free(&s->coeffs_buf_default);
-+       s->coeffs_buf_arm[0] = 0;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8115d04..d5d3302 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -922,6 +922,7 @@ typedef struct HEVCContext {
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+     int num_pred_cmds;
-++    int vpu_id;
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 12ad5fb..378dd74 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,9 +1,13 @@
-+ #ifdef RPI
-+-// Use the vcsm device for shared memory
-++// define RPI_USE_VCSM to use the vcsm device for shared memory
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+-#define RPI_TIME_TOTAL_QPU
-+-#define RPI_TIME_TOTAL_VPU
-++// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-++//#define RPI_TIME_TOTAL_QPU
-++// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-++//#define RPI_TIME_TOTAL_VPU
-++// define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-++#define RPI_ASYNC
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -113,6 +117,19 @@ static unsigned int Microseconds(void) {
-+ }
-+ #endif
-+ 
-++#ifdef RPI_ASYNC
-++pthread_t vpu_thread;
-++static void *vpu_start(void *arg);
-++
-++#define MAXCMDS 128
-++static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-++static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-++
-++static int vpu_cmds[MAXCMDS][8];
-++static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-++static volatile int vpu_async_head=0;
-++#endif
-++
-+ // Connect to QPU, returns 0 on success.
-+ static int gpu_init(volatile struct GPU **gpu) {
-+   int mb = mbox_open();
-+@@ -164,12 +181,27 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   // And the transform coefficients
-+   memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+ 
-++#ifdef RPI_ASYNC
-++  {
-++    int err;
-++    vpu_async_tail = 0;
-++    vpu_async_head = 0;
-++    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-++    //printf("Created thread\n");
-++    if (err) {
-++        printf("Failed to create vpu thread\n");
-++        return -4;
-++    }
-++  }
-++#endif
-++
-+   return 0;
-+ }
-+ 
-+ // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+ static void gpu_lock(void) {
-+   pthread_mutex_lock(&gpu_mutex);
-++
-+   if (gpu==NULL) {
-+     gpu_init(&gpu);
-+   }
-+@@ -264,6 +296,16 @@ static void gpu_term(void)
-+ 	unsigned handle = gpu->vc_handle;
-+   if (gpu==NULL)
-+     return;
-++
-++#ifdef RPI_ASYNC
-++  {
-++    void *res;
-++    vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-++    pthread_join(vpu_thread, &res);
-++  }
-++#endif
-++
-++
-+ 	unmapmem((void*)gpu, sizeof(struct GPU));
-+ 	mem_unlock(mb, handle);
-+ 	mem_free(mb, handle);
-+@@ -322,6 +364,79 @@ unsigned int vpu_get_constants(void) {
-+   return gpu->vc + offsetof(struct GPU,transMatrix2even);
-+ }
-+ 
-++#ifdef RPI_ASYNC
-++
-++static void *vpu_start(void *arg) {
-++  while(1) {
-++    pthread_mutex_lock(&post_mutex);
-++    while( vpu_async_tail - vpu_async_head <= 0)
-++    {
-++      //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-++      pthread_cond_wait(&post_cond, &post_mutex);
-++    }
-++    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    pthread_mutex_unlock(&post_mutex);
-++
-++    if (p[6] == -1) {
-++      break; // Last job
-++    }
-++    if (p[7]) {
-++        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-++        //gpu_cache_flush(buf);
-++    }
-++    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++
-++    pthread_mutex_lock(&post_mutex);
-++    vpu_async_head++;
-++    pthread_cond_broadcast(&post_cond);
-++    pthread_mutex_unlock(&post_mutex);
-++  }
-++
-++  return NULL;
-++}
-++
-++// Post a command to the queue
-++// Returns an id which we can use to wait for completion
-++int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
-++{
-++  pthread_mutex_lock(&post_mutex);
-++  {
-++    int id = vpu_async_tail++;
-++    int *p = vpu_cmds[id%MAXCMDS];
-++    int num = vpu_async_tail - vpu_async_head;
-++    if (num>MAXCMDS) {
-++      printf("Too many commands submitted\n");
-++      exit(-1);
-++    }
-++    p[0] = code;
-++    p[1] = r0;
-++    p[2] = r1;
-++    p[3] = r2;
-++    p[4] = r3;
-++    p[5] = r4;
-++    p[6] = r5;
-++    p[7] = (int) buf;
-++    if (num<=1)
-++      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-++    pthread_mutex_unlock(&post_mutex);
-++    return id;
-++  }
-++}
-++
-++// Wait for completion of the given command
-++void vpu_wait(int id)
-++{
-++  pthread_mutex_lock(&post_mutex);
-++  while( id + 1 - vpu_async_head > 0)
-++  {
-++    pthread_cond_wait(&post_cond, &post_mutex);
-++  }
-++  pthread_mutex_unlock(&post_mutex);
-++}
-++
-++#endif
-++
-++
-+ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+ {
-+   unsigned r;
-+@@ -334,7 +449,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-+   static int count=0;
-+   static long long countr2=0;
-+ #endif
-++#ifndef RPI_ASYNC
-+   gpu_lock();
-++#endif
-+ #ifdef RPI_TIME_TOTAL_VPU
-+   start_time = Microseconds();
-+   if (last_time==0)
-+@@ -351,7 +468,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-+   if ((count&0x7f)==0)
-+     printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
-+ #endif
-++#ifndef RPI_ASYNC
-+   gpu_unlock();
-++#endif
-+   return r;
-+ }
-+ 
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 814fc3c..3526fce 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -36,6 +36,8 @@ extern unsigned int qpu_get_fn(int num);
-+ extern unsigned int vpu_get_fn(void);
-+ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-++extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+ extern int rpi_test_shader(void);
-+-- 
-+2.5.0
-+
-+
-+From d0d2aad80a05a30b2aca1d96dec3856c3a8d0ab9 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 15:03:37 +0100
-+Subject: [PATCH 11/68] Added different signal when tail moves
-+
-+---
-+ libavcodec/rpi_qpu.c | 11 ++++++-----
-+ 1 file changed, 6 insertions(+), 5 deletions(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 378dd74..d1c3e20 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -122,7 +122,8 @@ pthread_t vpu_thread;
-+ static void *vpu_start(void *arg);
-+ 
-+ #define MAXCMDS 128
-+-static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-++static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-++static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+ static int vpu_cmds[MAXCMDS][8];
-+@@ -372,7 +373,7 @@ static void *vpu_start(void *arg) {
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-+-      pthread_cond_wait(&post_cond, &post_mutex);
-++      pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+     int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -388,7 +389,7 @@ static void *vpu_start(void *arg) {
-+ 
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+-    pthread_cond_broadcast(&post_cond);
-++    pthread_cond_broadcast(&post_cond_head);
-+     pthread_mutex_unlock(&post_mutex);
-+   }
-+ 
-+@@ -417,7 +418,7 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+     p[6] = r5;
-+     p[7] = (int) buf;
-+     if (num<=1)
-+-      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-++      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+     return id;
-+   }
-+@@ -429,7 +430,7 @@ void vpu_wait(int id)
-+   pthread_mutex_lock(&post_mutex);
-+   while( id + 1 - vpu_async_head > 0)
-+   {
-+-    pthread_cond_wait(&post_cond, &post_mutex);
-++    pthread_cond_wait(&post_cond_head, &post_mutex);
-+   }
-+   pthread_mutex_unlock(&post_mutex);
-+ }
-+-- 
-+2.5.0
-+
-+
-+From dcb7e7134ab80be7971979f9893a83814d7ea962 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 08:57:11 +0100
-+Subject: [PATCH 12/68] Add option to test for gpu_idle
-+
-+---
-+ libavcodec/hevc.c    |  3 ++-
-+ libavcodec/rpi_qpu.c | 18 ++++++++++++++++++
-+ 2 files changed, 20 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 9b3edf2..84cc636 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2514,7 +2514,6 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-+     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+@@ -2656,6 +2655,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+             // Transform all blocks
-++            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index d1c3e20..85f49db 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -199,6 +199,17 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   return 0;
-+ }
-+ 
-++// Returns 1 if the gpu is currently idle
-++static int gpu_idle(void)
-++{
-++  int ret = pthread_mutex_trylock(&gpu_mutex);
-++  if (ret==0) {
-++    pthread_mutex_unlock(&gpu_mutex);
-++    return 1;
-++  }
-++  return 0;
-++}
-++
-+ // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+ static void gpu_lock(void) {
-+   pthread_mutex_lock(&gpu_mutex);
-+@@ -400,6 +411,13 @@ static void *vpu_start(void *arg) {
-+ // Returns an id which we can use to wait for completion
-+ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
-+ {
-++  // If the gpu is idle then just run the command immediately
-++  // This works, but doesn't seem to give any benefit
-++  // if (gpu_idle()) {
-++  //   vpu_execute_code( code,  r0,  r1,  r2,  r3,  r4,  r5);
-++  //   return -1; // TODO perhaps a wraparound bug here?
-++  // }
-++
-+   pthread_mutex_lock(&post_mutex);
-+   {
-+     int id = vpu_async_tail++;
-+-- 
-+2.5.0
-+
-+
-+From 44d05d44ab3f81fec1ba75082ca2fe9340cb229c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 11:01:35 +0100
-+Subject: [PATCH 13/68] Added deblocking pass
-+
-+---
-+ libavcodec/hevc.c        | 33 +++++++++++++++++++++++++++------
-+ libavcodec/hevc.h        |  7 ++++++-
-+ libavcodec/hevc_filter.c |  6 +++++-
-+ libavcodec/rpi_qpu.c     |  2 +-
-+ 4 files changed, 39 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 84cc636..57b0b63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2505,6 +2505,17 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ }
-+ 
-+ #ifdef RPI
-++static void rpi_execute_dblk_cmds(HEVCContext *s)
-++{
-++    int n;
-++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-++    int (*p)[2] = s->dblk_cmds;
-++    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-++    }
-++    s->num_dblk_cmds = 0;
-++}
-++
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+@@ -2618,7 +2629,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-+-    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-+     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+ #endif
-+ 
-+@@ -2652,7 +2662,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-++        if (s->enable_rpi) {
-++          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-++          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-++          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+             // Transform all blocks
-+             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+@@ -2665,10 +2678,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+-            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
-+-                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
-+-            }
-+-            start_ctb_x = 0;
-++            rpi_execute_dblk_cmds(s);
-++          }
-+         }
-+ #endif
-+         if (more_data < 0) {
-+@@ -2686,6 +2697,16 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-+     }
-+ 
-++#ifdef RPI
-++    if (s->enable_rpi && s->num_dblk_cmds) {
-++        rpi_execute_transform(s);
-++        rpi_execute_inter_cmds(s);
-++        vpu_wait(s->vpu_id);
-++        rpi_execute_pred_cmds(s);
-++        rpi_execute_dblk_cmds(s);
-++    }
-++#endif
-++
-+     if (x_ctb + ctb_size >= s->ps.sps->width &&
-+         y_ctb + ctb_size >= s->ps.sps->height)
-+         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index d5d3302..0b4c175 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -834,6 +834,8 @@ typedef struct HEVCLocalContext {
-+ #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-++// Worst case is 16x16 CTUs
-++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
-+ 
-+ #define RPI_CMD_LUMA_UNI 0
-+ #define RPI_CMD_CHROMA_UNI 1
-+@@ -893,6 +895,9 @@ typedef struct HEVCPredCmd {
-+ #endif
-+ 
-+ typedef struct HEVCContext {
-++#ifdef RPI
-++    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-++#endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+ 
-+@@ -917,11 +922,11 @@ typedef struct HEVCContext {
-+     GPU_MEM_PTR_T coeffs_buf_accelerated;
-+     int16_t *coeffs_buf_arm[4];
-+     unsigned int coeffs_buf_vc[4];
-+-
-+     int num_coeffs[4];
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+     int num_pred_cmds;
-++    int num_dblk_cmds;
-+     int vpu_id;
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index e4c3da7..ea0af91 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -877,8 +877,12 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             if (s->threads_type & FF_THREAD_FRAME )
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+         }
-+-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
-++    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-++        int newh = y + ctb_size - 4;
-++        //int currh = s->ref->tf.progress->data[0];
-++        //if (((y + ctb_size)&63)==0)
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++    }
-+ }
-+ 
-+ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 85f49db..3b6dae7 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -105,7 +105,7 @@ struct GPU
-+ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-+ 
-+-#ifdef RPI_TIME_TOTAL_QPU
-++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-+ static unsigned int Microseconds(void) {
-+     struct timespec ts;
-+     unsigned int x;
-+-- 
-+2.5.0
-+
-+
-+From c4e1242d732ea2a14ce7cee5fb36e79bd2d8db35 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 16:47:47 +0100
-+Subject: [PATCH 14/68] Added option to disable deblocking for non-ref frames
-+
-+---
-+ libavcodec/hevc_filter.c | 10 ++++++++++
-+ 1 file changed, 10 insertions(+)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index ea0af91..2cdd621 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -25,6 +25,8 @@
-+ //#define DISABLE_SAO
-+ //#define DISABLE_DEBLOCK
-+ //#define DISABLE_STRENGTHS
-++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-++//#define DISABLE_DEBLOCK_NONREF
-+ 
-+ #include "libavutil/common.h"
-+ #include "libavutil/internal.h"
-+@@ -504,6 +506,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-++#ifdef DISABLE_DEBLOCK_NONREF
-++    if (    s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )
-++      return; // Don't deblock non-reference frames
-++#endif
-+ #ifdef DISABLE_DEBLOCK
-+     return;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 8b2f1cd9b31d0c1ded0b00d4106b18897c1450e5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 11 May 2015 10:00:27 +0100
-+Subject: [PATCH 15/68] Moved buffers to VPU memory
-+
-+---
-+ libavcodec/hevc_filter.c | 17 +++++++++++++-
-+ libavcodec/utils.c       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
-+ libavutil/buffer.c       |  6 +++++
-+ libavutil/buffer.h       |  3 +++
-+ 4 files changed, 84 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 2cdd621..e1b32d4 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -866,6 +866,13 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+ #undef CB
-+ #undef CR
-+ 
-++#ifdef RPI_INTER_QPU
-++static void flush_buffer(AVBufferRef *bref) {
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++    gpu_cache_flush(p);
-++}
-++#endif
-++
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-+@@ -888,9 +895,17 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+         }
-+     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-+-        int newh = y + ctb_size - 4;
-++        //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-++        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )) {
-++            flush_buffer(s->frame->buf[1]);
-++            flush_buffer(s->frame->buf[2]);
-++        }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+ }
-+diff --git a/libavcodec/utils.c b/libavcodec/utils.c
-+index 892ddb9..df750a8 100644
-+--- a/libavcodec/utils.c
-++++ b/libavcodec/utils.c
-+@@ -26,6 +26,12 @@
-+  */
-+ 
-+ #include "config.h"
-++
-++#ifdef RPI
-++// Move video buffers to GPU memory
-++#define RPI_GPU_BUFFERS
-++#endif
-++
-+ #include "libavutil/atomic.h"
-+ #include "libavutil/attributes.h"
-+ #include "libavutil/avassert.h"
-+@@ -70,6 +76,10 @@
-+ #include "libavutil/ffversion.h"
-+ const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
-+ 
-++#ifdef RPI_GPU_BUFFERS
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
-+ static int default_lockmgr_cb(void **arg, enum AVLockOp op)
-+ {
-+@@ -505,6 +515,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
-+     return ret;
-+ }
-+ 
-++#ifdef RPI_GPU_BUFFERS
-++static void rpi_buffer_default_free(void *opaque, uint8_t *data)
-++{
-++    GPU_MEM_PTR_T *p = opaque;
-++    gpu_free(p);
-++    av_free(p);
-++}
-++
-++static AVBufferRef *rpi_buffer_alloc(int size)
-++{
-++    AVBufferRef *ret = NULL;
-++    uint8_t    *data = NULL;
-++    GPU_MEM_PTR_T *p;
-++
-++    static int total=0;
-++    total+=size;
-++
-++    p = av_malloc(sizeof *p);
-++    if (!p)
-++        return NULL;
-++
-++    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
-++        return NULL;
-++
-++    data = p->arm;
-++    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
-++    //memset(data, 64, size);
-++
-++    if (!data)
-++        return NULL;
-++
-++    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
-++    if (!ret) {
-++        gpu_free(p);
-++        av_freep(&p);
-++    }
-++
-++    return ret;
-++}
-++#endif
-++
-+ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
-+ {
-+     FramePool *pool = avctx->internal->pool;
-+@@ -549,6 +600,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
-+             av_buffer_pool_uninit(&pool->pools[i]);
-+             pool->linesize[i] = picture.linesize[i];
-+             if (size[i]) {
-++#ifdef RPI_GPU_BUFFERS
-++                if (avctx->codec_id == AV_CODEC_ID_HEVC)
-++                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-++                                                     CONFIG_MEMORY_POISONING ?
-++                                                        NULL :
-++                                                        rpi_buffer_alloc);
-++                else
-++#endif
-+                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-+                                                      CONFIG_MEMORY_POISONING ?
-+                                                         NULL :
-+diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-+index bb112c2..7f8bfab 100644
-+--- a/libavutil/buffer.c
-++++ b/libavutil/buffer.c
-+@@ -400,3 +400,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
-+ 
-+     return ret;
-+ }
-++
-++// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
-++void *av_buffer_pool_opaque(AVBufferRef *ref) {
-++  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
-++  return buf->opaque;
-++}
-+diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-+index b4399fd..0489002 100644
-+--- a/libavutil/buffer.h
-++++ b/libavutil/buffer.h
-+@@ -267,6 +267,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
-+  */
-+ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
-+ 
-++// Return the opaque for the underlying frame
-++void *av_buffer_pool_opaque(AVBufferRef *ref);
-++
-+ /**
-+  * @}
-+  */
-+-- 
-+2.5.0
-+
-+
-+From a51c8db9d5ed7d90ad83d7791dd8924911a88bd7 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 11 May 2015 14:04:37 +0100
-+Subject: [PATCH 16/68] Prepared QPU execute code
-+
-+---
-+ libavcodec/hevc.c        | 227 ++++++++++++++++++++++++++++++++++++++++-------
-+ libavcodec/hevc.h        |  22 ++++-
-+ libavcodec/hevc_filter.c |   7 +-
-+ libavcodec/rpi_qpu.c     |  55 +++++++++++-
-+ libavcodec/rpi_qpu.h     |   2 +
-+ 5 files changed, 276 insertions(+), 37 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 57b0b63..d055b47 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -40,17 +40,45 @@
-+ #include "hevc.h"
-+ 
-+ #ifdef RPI
-+-#include "rpi_qpu.h"
-+-// For some unknown reason, the code seems to crash if I do a late malloc
-+-#define EARLY_MALLOC
-+-// Move Inter prediction into separate pass
-+-#define RPI_INTER
-++  #include "rpi_qpu.h"
-++  // For some unknown reason, the code seems to crash if I do a late malloc
-++  #define EARLY_MALLOC
-++  // Move Inter prediction into separate pass
-++  #define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+ 
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-++
-++#ifdef RPI_INTER_QPU
-++
-++#define RPI_CHROMA_COMMAND_WORDS 12
-++// The QPU code for UV blocks only works up to a block width of 8
-++#define RPI_CHROMA_BLOCK_WIDTH 8
-++
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-++
-++// TODO Chroma only needs 4 taps
-++static uint32_t rpi_filter_coefs[8][2] = {
-++        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-++};
-++
-++static uint32_t get_vc_address(AVBufferRef *bref) {
-++  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++  return p->vc;
-++}
-++
-++#endif
-++
-+ /**
-+  * NOTE: Each function hls_foo correspond to the function foo in the
-+  * specification (HLS stands for High Level Syntax).
-+@@ -64,6 +92,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-+ #ifdef RPI
-++
-+ #ifdef EARLY_MALLOC
-+ #else
-+     printf("pic_arrays_free\n");
-+@@ -1969,6 +1998,43 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int reflist = 0;
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[reflist];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
-+@@ -2619,6 +2685,54 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ 
-+ #endif
-+ 
-++#ifdef RPI_INTER_QPU
-++static void rpi_inter_clear(HEVCContext *s)
-++{
-++    int i;
-++    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-++    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-++    for(i=0;i<8;i++) {
-++        s->u_mvs[i] = s->mvs_base[i];
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = pic_width;
-++        *s->u_mvs[i]++ = pic_height;
-++        *s->u_mvs[i]++ = s->frame->linesize[1];
-++        *s->u_mvs[i]++ = s->frame->linesize[2];
-++        s->u_mvs[i] += 3;  // Padding words
-++    }
-++}
-++
-++static void rpi_execute_inter_qpu(HEVCContext *s)
-++{
-++    int k;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++
-++    if (s->sh.slice_type == I_SLICE)
-++        return;
-++    for(k=0;k<8;k++) {
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++    }
-++
-++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++
-++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      );
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -2645,6 +2759,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         }
-+     }
-+ 
-++#ifdef RPI_INTER_QPU
-++    rpi_inter_clear(s);
-++#endif
-++
-+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-+         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+ 
-+@@ -2666,19 +2784,30 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-            int x;
-++#ifdef RPI_INTER_QPU
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++#endif
-+             // Transform all blocks
-+             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-++
-++            // Copy back reconstructed data
-++            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
-++            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
-++            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-++
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+             rpi_execute_dblk_cmds(s);
-++#ifdef RPI_INTER_QPU
-++            rpi_inter_clear(s);
-++#endif
-+           }
-+         }
-+ #endif
-+@@ -2699,6 +2828,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+ #ifdef RPI
-+     if (s->enable_rpi && s->num_dblk_cmds) {
-++#ifdef RPI_INTER_QPU
-++        rpi_execute_inter_qpu(s);
-++#endif
-+         rpi_execute_transform(s);
-+         rpi_execute_inter_cmds(s);
-+         vpu_wait(s->vpu_id);
-+@@ -3374,6 +3506,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+ 
-++#ifdef RPI_INTER_QPU
-++    if (s->unif_mvs) {
-++        gpu_free( &s->unif_mvs_ptr );
-++        s->unif_mvs = 0;
-++    }
-++#endif
-++    //gpu_free(&s->dummy);
-++
-+ #ifdef EARLY_MALLOC
-+     printf("hevc_decode_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+@@ -3469,34 +3609,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+ 
-+-    s->coeffs_buf_arm[0] = 0;
-+-    s->coeffs_buf_arm[2] = 0;
-++#ifdef RPI_INTER_QPU
-++    // We divide the image into blocks 256 wide and 64 high
-++    // We support up to 2048 widths
-++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
-++    // Also add space for the startup command for each stream.
-++
-++    {
-++        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++        uint32_t *p;
-++        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++
-++        // Set up initial locations for uniform streams
-++        p = s->unif_mvs;
-++        for(i = 0; i < 8; i++) {
-++            s->mvs_base[i] = p;
-++            p += uv_commands_per_qpu;
-++        }
-++        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-++        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-++
-++    }
-++#endif
-++    //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+ #ifdef EARLY_MALLOC
-+-    int coeffs_in_ctb = 64*64;
-+-    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    printf("Allocated %d\n",coefs_per_row);
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-    if (!s->coeffs_buf_arm[0])
-+-        goto fail;
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-    if (!s->coeffs_buf_arm[2])
-+-        goto fail;
-+-    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-    printf("Done\n");
-++    {
-++        int coeffs_in_ctb = 64*64;
-++        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-++        s->coeffs_buf_arm[0] = 0;
-++        s->coeffs_buf_arm[2] = 0;
-++        printf("Allocated %d\n",coefs_per_row);
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++        if (!s->coeffs_buf_arm[0])
-++            goto fail;
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++        if (!s->coeffs_buf_arm[2])
-++            goto fail;
-++        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++        printf("Done\n");
-+ #ifdef RPI_PRECLEAR
-+-    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+-    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+-    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+ #endif
-+-
-++    }
-+ #endif
-+ 
-+     s->enable_rpi = 0;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 0b4c175..8923a25 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -41,7 +41,11 @@
-+ 
-+ // define RPI to split the CABAC/prediction/transform into separate stages
-+ #ifdef RPI
-+-#include "rpi_qpu.h"
-++
-++  #include "rpi_qpu.h"
-++  // Use QPU for inter prediction
-++  //#define RPI_INTER_QPU
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -914,7 +918,7 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;
-++    HEVCMvCmd *unif_mv_cmds;  // TODO rename
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+     int buf_width;
-+@@ -928,6 +932,20 @@ typedef struct HEVCContext {
-+     int num_pred_cmds;
-+     int num_dblk_cmds;
-+     int vpu_id;
-++    //GPU_MEM_PTR_T dummy;
-++#ifdef RPI_INTER_QPU
-++    GPU_MEM_PTR_T unif_mvs_ptr;
-++    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++
-++    // _base pointers are to the start of the row
-++    uint32_t *mvs_base[8];
-++    // these pointers are to the next free space
-++    uint32_t *u_mvs[8];
-++    // Function pointers
-++    uint32_t mc_filter_uv;
-++    uint32_t mc_filter_uv_b;
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index e1b32d4..5b3d759 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -903,8 +903,11 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-            flush_buffer(s->frame->buf[1]);
-+-            flush_buffer(s->frame->buf[2]);
-++            //flush_buffer(s->frame->buf[1]);
-++            //flush_buffer(s->frame->buf[2]);
-++            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-++            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-++            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+         }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 3b6dae7..e4dd58a 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -3,7 +3,7 @@
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+-//#define RPI_TIME_TOTAL_QPU
-++#define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+@@ -30,7 +30,7 @@
-+ #endif
-+ 
-+ // On Pi2 there is no way to access the VPU L2 cache
-+-// GPU_MEM_FLG should be 4 for uncached memory.
-++// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+ // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-+ // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-+ #define GPU_MEM_FLG 0xC
-+@@ -549,6 +549,54 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-+   gpu_unlock();
-+ }
-+ 
-++// Run a program on 8 QPUs with the given code and uniform stream (given in GPU addresses)
-++void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++{
-++  int i;
-++#ifdef RPI_TIME_TOTAL_QPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++#endif
-++
-++  gpu_lock();
-++#ifdef RPI_TIME_TOTAL_QPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-++  for(i=0;i<8;i++) {
-++    gpu->mail[i*2 + 1] = code;
-++  }
-++  gpu->mail[0 ] = unifs1;
-++  gpu->mail[2 ] = unifs2;
-++  gpu->mail[4 ] = unifs3;
-++  gpu->mail[6 ] = unifs4;
-++  gpu->mail[8 ] = unifs5;
-++  gpu->mail[10] = unifs6;
-++	gpu->mail[12] = unifs7;
-++	gpu->mail[14] = unifs8;
-++	execute_qpu(
-++		gpu->mb,
-++		8 /* Number of QPUs */,
-++		gpu->vc + offsetof(struct GPU, mail),
-++		1 /* no flush */,  // Don't flush VPU L1 cache
-++		5000 /* timeout ms */);
-++#ifdef RPI_TIME_TOTAL_QPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  if ((count&0x7f)==0)
-++    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-++  gpu_unlock();
-++}
-++
-+ unsigned int qpu_get_fn(int num) {
-+     // Make sure that the gpu is initialized
-+     unsigned int *fn;
-+@@ -585,6 +633,9 @@ unsigned int qpu_get_fn(int num) {
-+     case QPU_MC_FILTER_UV_B:
-+       fn = mc_filter_uv_b;
-+       break;
-++    case QPU_MC_INTERRUPT_EXIT8:
-++      fn = mc_interrupt_exit8;
-++      break;
-+     case QPU_MC_END:
-+       fn = mc_end;
-+       break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 3526fce..2b22d98 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -16,6 +16,7 @@ extern void gpu_free(GPU_MEM_PTR_T *p);
-+ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+ 
-+ // QPU specific functions
-++extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-+ 
-+ enum {
-+@@ -28,6 +29,7 @@ enum {
-+   QPU_MC_SETUP_UV,
-+   QPU_MC_FILTER_UV,
-+   QPU_MC_FILTER_UV_B,
-++  QPU_MC_INTERRUPT_EXIT8,
-+   QPU_MC_END
-+   };
-+ extern unsigned int qpu_get_fn(int num);
-+-- 
-+2.5.0
-+
-+
-+From 5fc9797992781c83747eadba05b8092cd85ebba7 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 11:47:23 +0100
-+Subject: [PATCH 17/68] Drafted chroma interpolation on QPUs
-+
-+---
-+ libavcodec/hevc.c          |   5 ++-
-+ libavcodec/hevc.h          |   2 +-
-+ libavcodec/hevc_filter.c   |   6 ++-
-+ libavcodec/rpi_qpu.c       | 101 +++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.h       |   1 +
-+ libavcodec/rpi_shader.c    |  42 +++++++++----------
-+ libavcodec/rpi_shader.qasm |  42 +++++++++----------
-+ 7 files changed, 149 insertions(+), 50 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index d055b47..7897fdd 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -58,11 +58,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-+-#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+ static uint32_t rpi_filter_coefs[8][2] = {
-+-        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-+@@ -2716,6 +2716,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8923a25..a0d4631 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -44,7 +44,7 @@
-+ 
-+   #include "rpi_qpu.h"
-+   // Use QPU for inter prediction
-+-  //#define RPI_INTER_QPU
-++  // #define RPI_INTER_QPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 5b3d759..9b6e26d 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -903,8 +903,10 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-            //flush_buffer(s->frame->buf[1]);
-+-            //flush_buffer(s->frame->buf[2]);
-++#ifdef RPI_INTER_QPU
-++            flush_buffer(s->frame->buf[1]);
-++            flush_buffer(s->frame->buf[2]);
-++#endif
-+             //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+             //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+             //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index e4dd58a..4d9eda8 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -33,7 +33,8 @@
-+ // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+ // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-+ // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-+-#define GPU_MEM_FLG 0xC
-++#define GPU_MEM_FLG 0x4
-++// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
-+ #define GPU_MEM_MAP 0x0
-+ 
-+ #define vcos_verify(x) ((x)>=0)
-+@@ -165,6 +166,8 @@ static int gpu_init(volatile struct GPU **gpu) {
-+ 	ptr->vc_handle = handle;
-+ 	ptr->vc = vc;
-+ 
-++  printf("GPU allocated at 0x%x\n",vc);
-++
-+   *gpu = ptr;
-+ 
-+   // Now copy over the QPU code into GPU memory
-+@@ -304,10 +307,13 @@ int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-+ 
-+ static void gpu_term(void)
-+ {
-+-	int mb = gpu->mb;
-+-	unsigned handle = gpu->vc_handle;
-++	int mb;
-++	unsigned handle;
-++
-+   if (gpu==NULL)
-+     return;
-++  mb = gpu->mb;
-++  handle = gpu->vc_handle;
-+ 
-+ #ifdef RPI_ASYNC
-+   {
-+@@ -648,6 +654,95 @@ unsigned int qpu_get_fn(int num) {
-+ }
-+ 
-+ #if 0
-++typedef unsigned int uint32_t;
-++
-++typedef struct mvs_s {
-++    GPU_MEM_PTR_T unif_mvs_ptr;
-++    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++
-++    // _base pointers are to the start of the row
-++    uint32_t *mvs_base[8];
-++    // these pointers are to the next free space
-++    uint32_t *u_mvs[8];
-++
-++} HEVCContext;
-++
-++#define RPI_CHROMA_COMMAND_WORDS 12
-++
-++static void rpi_inter_clear(HEVCContext *s)
-++{
-++    int i;
-++    for(i=0;i<8;i++) {
-++        s->u_mvs[i] = s->mvs_base[i];
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 128;  // w
-++        *s->u_mvs[i]++ = 128;  // h
-++        *s->u_mvs[i]++ = 128;  // stride u
-++        *s->u_mvs[i]++ = 128;  // stride v
-++        s->u_mvs[i] += 3;  // Padding words
-++    }
-++}
-++
-++static void rpi_execute_inter_qpu(HEVCContext *s)
-++{
-++    int k;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++
-++    for(k=0;k<8;k++) {
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
-++    }
-++
-++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++
-++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      );
-++}
-++
-++void rpi_test_qpu(void)
-++{
-++    HEVCContext mvs;
-++    HEVCContext *s = &mvs;
-++    int i;
-++    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++    uint32_t *p;
-++    printf("Allocate memory\n");
-++    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-++
-++    // Set up initial locations for uniform streams
-++    p = s->unif_mvs;
-++    for(i = 0; i < 8; i++) {
-++        s->mvs_base[i] = p;
-++        p += uv_commands_per_qpu;
-++    }
-++    // Now run a simple program that should just quit immediately after a single texture fetch
-++    rpi_inter_clear(s);
-++    for(i=0;i<4;i++) {
-++      printf("Launch QPUs\n");
-++      rpi_execute_inter_qpu(s);
-++      printf("Done\n");
-++    }
-++    printf("Free memory\n");
-++    gpu_free(&s->unif_mvs_ptr);
-++    return;
-++}
-++#endif
-++
-++#if 0
-+ 
-+ int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+ //int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 2b22d98..f9ad333 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -18,6 +18,7 @@ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+ // QPU specific functions
-+ extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-++extern void rpi_test_qpu(void);
-+ 
-+ enum {
-+   QPU_MC_SETUP,
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 41cc2e1..d7ed297 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -152,23 +152,23 @@ unsigned int rpi_shader[] = {
-+ /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ /* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ /* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ /* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ /* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ /* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+ /* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+ /* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+ /* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+@@ -179,20 +179,20 @@ unsigned int rpi_shader[] = {
-+ /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+ /* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
-+ /* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
-+ /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ /* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+ /* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+ /* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+ /* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 6851e83..02fdcb2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -270,23 +270,23 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++mov r2, rb21         ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-++add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+@@ -302,23 +302,23 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+ max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-++asr r0, r0, 14          ; mov r1, ra21
-+ min.setf ra15, r0, rb22
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add.ifnn r1, r1, r0     ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ brr.anyn -, r:uvloop
-+-asr r1, r1, 15
-++asr r1, r1, 14
-+ min r1, r1, rb22
-+ max vpm, r1, 0
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 41380ff18142eef6a80ffae43f0c3d810c9384d8 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 13:54:11 +0100
-+Subject: [PATCH 18/68] Fixed chroma inter prediction
-+
-+---
-+ libavcodec/hevc.c          |    8 +-
-+ libavcodec/hevc.h          |    2 +-
-+ libavcodec/rpi_shader.c    | 1170 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   22 +-
-+ libavcodec/rpi_shader.qasm |   24 +-
-+ 5 files changed, 617 insertions(+), 609 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7897fdd..bcc831e 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -55,9 +55,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ #ifdef RPI_INTER_QPU
-+ 
-+ #define RPI_CHROMA_COMMAND_WORDS 12
-++#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-++
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+@@ -2011,7 +2013,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                int chan = x0>>8;
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2717,6 +2720,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-++        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+@@ -3617,7 +3621,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     // Also add space for the startup command for each stream.
-+ 
-+     {
-+-        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-+         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index a0d4631..cae6659 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -44,7 +44,7 @@
-+ 
-+   #include "rpi_qpu.h"
-+   // Use QPU for inter prediction
-+-  // #define RPI_INTER_QPU
-++  #define RPI_INTER_QPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index d7ed297..831633b 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -33,7 +33,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+ /* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+ /* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+ /* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+ /* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+@@ -152,7 +152,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-++/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+ /* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ /* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+@@ -178,9 +178,9 @@ unsigned int rpi_shader[] = {
-+ /* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
-+-/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+ /* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+ /* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+ /* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+@@ -189,400 +189,400 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+ /* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ /* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+ /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter
-+-/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+-/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-++/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :loop
-+-/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // :fast_path
-+-/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :fast_loop
-+-/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+-/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+-/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-++/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-++/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :bloop
-+-/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+-/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-++/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_honly
-+-/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+-/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+-/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-++/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-++/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :loop_honly
-+-/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+-/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+-/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+-/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+-/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+-/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+-/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-++/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-++/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-++/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-++/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-++/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-++/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+ /* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_exit1
-+-/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit
-+-/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+@@ -592,225 +592,227 @@ unsigned int rpi_shader[] = {
-+ /* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit4
-+-/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup_uv
-+-/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+-/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+-/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
-+-/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
-++/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv_b
-+-/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index db971f4..3464cdb 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,16 +5,16 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 146)
-+-#define mc_filter (rpi_shader + 360)
-+-#define mc_filter_b (rpi_shader + 670)
-+-#define mc_filter_honly (rpi_shader + 894)
-+-#define mc_exit (rpi_shader + 1048)
-+-#define mc_exit1 (rpi_shader + 1066)
-+-#define mc_interrupt_exit (rpi_shader + 1082)
-+-#define mc_interrupt_exit4 (rpi_shader + 1120)
-+-#define mc_interrupt_exit8 (rpi_shader + 1142)
-+-#define mc_setup_uv (rpi_shader + 1172)
-+-#define mc_filter_uv_b (rpi_shader + 1314)
-+-#define mc_end (rpi_shader + 1542)
-++#define mc_filter (rpi_shader + 364)
-++#define mc_filter_b (rpi_shader + 674)
-++#define mc_filter_honly (rpi_shader + 898)
-++#define mc_exit (rpi_shader + 1052)
-++#define mc_exit1 (rpi_shader + 1070)
-++#define mc_interrupt_exit (rpi_shader + 1086)
-++#define mc_interrupt_exit4 (rpi_shader + 1124)
-++#define mc_interrupt_exit8 (rpi_shader + 1146)
-++#define mc_setup_uv (rpi_shader + 1176)
-++#define mc_filter_uv_b (rpi_shader + 1318)
-++#define mc_end (rpi_shader + 1546)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 02fdcb2..4809e1d 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -21,7 +21,7 @@
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+-# ra21                                          64
-++# ra21                                          32
-+ # ra22                                          256
-+ # ra23                                          8
-+ #
-+@@ -97,7 +97,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 64
-++mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-+ 
-+@@ -270,7 +270,7 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r2, r0, ra0
-++nop                  ; mul24 r2, r0, ra0
-+ nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+@@ -301,9 +301,9 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 14          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -315,12 +315,14 @@ add r1, r1, r0          ; mul24 r0, ra10, rb10
-+ add r1, r1, r0          ; mul24 r0, ra9, rb9
-+ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-add.ifnn r1, r1, r0     ; mov -, vw_wait
-++add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:uvloop
-+ asr r1, r1, 14
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-++add r1, r1, ra21
-++brr.anyn -, r:uvloop
-++asr r1, r1, 6          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out for U
-+ 
-+@@ -1161,7 +1163,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 64
-++mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b558abbe8e70ebb5d75988e2cd21976474a2d4eb Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 14:37:32 +0100
-+Subject: [PATCH 19/68] Removed unused luma functions
-+
-+---
-+ libavcodec/hevc.c          |    4 +-
-+ libavcodec/rpi_qpu.c       |   32 +-
-+ libavcodec/rpi_shader.c    | 1097 +++++++++++++-------------------------------
-+ libavcodec/rpi_shader.h    |   19 +-
-+ libavcodec/rpi_shader.qasm |  970 +++------------------------------------
-+ 5 files changed, 396 insertions(+), 1726 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index bcc831e..3967361 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2718,8 +2718,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         return;
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+         assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4d9eda8..4e90cc1 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -172,7 +172,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+ 
-+   // Now copy over the QPU code into GPU memory
-+   {
-+-    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
-+     assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+   }
-+@@ -612,24 +612,24 @@ unsigned int qpu_get_fn(int num) {
-+       gpu_unlock();
-+     }
-+     switch(num) {
-+-    case QPU_MC_SETUP:
-+-      fn = mc_setup;
-+-      break;
-+-    case QPU_MC_FILTER:
-+-      fn = mc_filter;
-+-      break;
-++    //case QPU_MC_SETUP:
-++    //  fn = mc_setup;
-++    //  break;
-++    //case QPU_MC_FILTER:
-++    //  fn = mc_filter;
-++    //  break;
-+     case QPU_MC_EXIT:
-+       fn = mc_exit;
-+       break;
-+-    case QPU_MC_INTERRUPT_EXIT:
-+-      fn = mc_interrupt_exit;
-+-      break;
-+-    case QPU_MC_FILTER_B:
-+-      fn = mc_filter_b;
-+-      break;
-+-    case QPU_MC_FILTER_HONLY:
-+-      fn = mc_filter_honly;
-+-      break;
-++    //case QPU_MC_INTERRUPT_EXIT:
-++    //  fn = mc_interrupt_exit;
-++    //  break;
-++    //case QPU_MC_FILTER_B:
-++    //  fn = mc_filter_b;
-++    //  break;
-++    //case QPU_MC_FILTER_HONLY:
-++    //  fn = mc_filter_honly;
-++    //  break;
-+     case QPU_MC_SETUP_UV:
-+       fn = mc_setup_uv;
-+       break;
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 831633b..170e8ac 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -21,798 +21,331 @@ __declspec(align(8))
-+ __attribute__((aligned(8)))
-+ #endif
-+ unsigned int rpi_shader[] = {
-+-// ::mc_setup
-++// ::mc_setup_uv
-+ /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+ /* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+ /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+ /* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-++/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ /* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+ /* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
-+-/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
-+-/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter
-+-/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+-/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :loop
-+-/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// :fast_path
-+-/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :fast_loop
-+-/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+-/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+-/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_b
-+-/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :bloop
-+-/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+-/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_honly
-+-/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+-/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+-/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :loop_honly
-+-/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+-/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+-/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+-/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+-/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+-/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+-/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_exit
-+-/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_exit1
-+-/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit
-+-/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit4
-+-/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit8
-+-/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_setup_uv
-+-/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+-/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+-/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_exit
-++/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3464cdb..9de4535 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -3,18 +3,11 @@
-+ 
-+ extern unsigned int rpi_shader[];
-+ 
-+-#define mc_setup (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 146)
-+-#define mc_filter (rpi_shader + 364)
-+-#define mc_filter_b (rpi_shader + 674)
-+-#define mc_filter_honly (rpi_shader + 898)
-+-#define mc_exit (rpi_shader + 1052)
-+-#define mc_exit1 (rpi_shader + 1070)
-+-#define mc_interrupt_exit (rpi_shader + 1086)
-+-#define mc_interrupt_exit4 (rpi_shader + 1124)
-+-#define mc_interrupt_exit8 (rpi_shader + 1146)
-+-#define mc_setup_uv (rpi_shader + 1176)
-+-#define mc_filter_uv_b (rpi_shader + 1318)
-+-#define mc_end (rpi_shader + 1546)
-++#define mc_setup_uv (rpi_shader + 0)
-++#define mc_filter_uv (rpi_shader + 142)
-++#define mc_filter_uv_b (rpi_shader + 360)
-++#define mc_exit (rpi_shader + 588)
-++#define mc_interrupt_exit8 (rpi_shader + 606)
-++#define mc_end (rpi_shader + 636)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 4809e1d..cd7346d 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -71,8 +71,10 @@
-+ 
-+ .set rb_const_64,                  rb21
-+ 
-+-# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
-+-::mc_setup
-++
-++################################################################################
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++::mc_setup_uv
-+ 
-+ # Read starting kernel
-+ mov ra31, unif
-+@@ -80,7 +82,9 @@ mov ra31, unif
-+ # Load first request location
-+ add ra_x_base, unif, elem_num # Store x
-+ mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame base
-++mov ra_x2_base, unif # Store frame u base
-++nop
-++sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+ 
-+ # Read image dimensions
-+ sub rb25,unif,1
-+@@ -143,29 +147,24 @@ mov r1, vpm_setup(0, 4, h8p(0, 0))
-+ add rb28, r0, r1
-+ 
-+ # Compute base address for first and second access
-+-#add r0, unif, elem_num     # x
-+ mov r0, ra_x_base           # Load x
-+-add r2, r0, 8               # x+8
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+ min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ add ra_y, r1, 1
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-max r1, r1, 0  # y
-+-min r1, r1, rb_frame_height_minus_1
-+-add r0, r0, r3; mul24 r1, r1, rb_pitch
-+-add r2, r2, r3
-++add r0, r0, r3
-+ and r0, r0, ~3
-+-and r2, r2, ~3; mov ra_x_base, r0
-++max r1, r1, 0 ; mov ra_x_base, r0 # y
-++min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_x2_base, r2
-+ add t0s, r2, r1
-+ 
-+ # Dump padding words
-+ mov r0, unif
-+ mov r0, unif
-++mov r0, unif
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -176,6 +175,8 @@ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_x_base
-+ add t0s, r1, ra_x2_base
-+ 
-++
-++
-+ ################################################################################
-+ 
-+ # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+@@ -341,453 +342,26 @@ add vw_setup, rb26, r0 # VDW setup 0
-+ mov vw_setup, rb29 # Stride
-+ mov vw_addr, unif # start the VDW
-+ 
-+-################################################################################
-+-
-+-
-+-# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-add r0, r0, r3
-+-add r2, r2, r3
-+-and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+-mov ra_y_next, r1
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-brr.anynn -, r:fast_path
-+-asr rb12, r0, rb23  # delay slot 1
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-
-+-mov r5rep, -8 # delay slot 2
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r2, r2, r3                                                    ; ldtmu0
-+-##
-+-## mov r0, ra22
-+-## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # apply horizontal filter
-+-##
-+-## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+-## min r2, r2, rb22
-+-## max ra13, r2, 0
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21
-+-## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra14, r0, 0
-+-##
-+-##
-+-##
-+-##
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra15, r0, 0
-+-
-+-
-+-
-+-
-+-mov r3, 0
-+-
-+-:loop
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-brr.anyn -, r:loop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:loop
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-+-
-+-# DMA out
-+-
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+-
-+-####################################################
-+-
-+-:fast_path
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-## sub r2, r2, r3                                                    ; ldtmu0
-+-##
-+-## mov r0, ra22
-+-## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # apply horizontal filter
-+-##
-+-## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+-## min r2, r2, rb22
-+-## max ra13, r2, 0
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21
-+-## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra14, r0, 0
-+-##
-+-##
-+-##
-+-##
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21   ; mul24    r3, r0, ra0
-+-## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra15, r0, 0
-+-
-+-
-+-mov r3, 0  # This signifies the amount of unrolling
-+-
-+-:fast_loop
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-
-+-max r2, ra_y, 0
-+-min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-sub r0, r2, r3       ; mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8       ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-
-+-brr.anyn -, r:fast_loop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:fast_loop
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-+-
-+-# DMA out
-+-
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter_b
-++::mc_filter_uv_b
-+ mov ra31, unif
-+ 
-+ # per-channel shifts were calculated on the *previous* invocation
-+ 
-+ mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+ 
-+ # get base addresses and per-channel shifts for *next* invocation
-+ add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+ max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-add r2, r2, r3
-+ and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+ mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -801,17 +375,22 @@ and r0, r0, rb22 # Extract height
-+ add rb17, r0, 5
-+ add rb18, r0, 7
-+ shl r0, r0, 7
-++
-+ # r0 is currently height<<7
-+ # For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+ shl r3, r0, 13
-+ shl r3, r3, 8 # Mask off top 8 bits
-+ shr r3, r3, 8
-++
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-++
-+ # In a B frame, so also set up VPM read
-+ add vr_setup, r3, rb28
-+ 
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -837,9 +416,13 @@ asr rb12, r0, rb23
-+ 
-+ mov r5rep, -8
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-+ mov r3, 0
-+ 
-+-:bloop
-++:uvloop_b
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+ 
-+@@ -847,7 +430,7 @@ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+ shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+ mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+@@ -861,6 +444,7 @@ add t0s, ra_x2_base, r2
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+ sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+ sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+@@ -889,7 +473,7 @@ mov ra13, ra14
-+ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+-brr.anyn -, r:bloop
-++brr.anyn -, r:uvloop_b
-+ max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+ asr r0, r0, 15          ; mov r1, ra21
-+ min.setf ra15, r0, rb22
-+@@ -906,213 +490,50 @@ sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+ sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+ sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15          ; mov -, vr_wait
-++asr r1, r1, 15
-+ min r1, r1, rb22
-+ add r0, vpm, 1          # Blend in previous VPM contents at this location
-+-brr.anyn -, r:bloop
-++brr.anyn -, r:uvloop_b
-+ max r1, r1, 0
-+ add r1, r1, r0
-+ shr vpm, r1, 1
-+ 
-+-# DMA out
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-+ 
-+ bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-+ mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-# This filter only does horizontal filtering.
-+-# It is assumed that the region to fetch does not include extra rows above.
-++# mc_exit()
-+ 
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter_honly
-+-mov ra31, unif
-++::mc_exit
-++mov  -, vw_wait # wait on the VDW
-+ 
-+-# per-channel shifts were calculated on the *previous* invocation
-++mov -,srel(0)
-+ 
-+-mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-add r0, r0, r3
-+-add r2, r2, r3
-+-and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+-mov ra_y_next, r1
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
-+-shl r0, r0, 7 ; mov rb18,r0
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov r0, unif
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+-mov r3, 0
-+-
-+-:loop_honly
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3       ; mov r3, rb31
-+-
-+-sub.setf -, r3, rb18 ; mov r1, ra22
-+-
-+-mov -, vw_wait   ; mul24 r0, r0, r1
-+-brr.anyn -, r:loop_honly
-+-asr r0, r0, 15          # delay 1
-+-min r0, r0, rb22        # delay 2
-+-max vpm, r0, 0          # delay 3
-+-
-+-# DMA out
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+-
-+-
-+-################################################################################
-+-
-+-# mc_exit()
-+-
-+-::mc_exit
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-mov -,srel(0)
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-+ 
-+ nop        ; nop ; thrend
-+ nop        ; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-+-::mc_exit1
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-#mov -,srel(1)
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+-# mc_interrupt_exit()
-+-::mc_interrupt_exit
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-mov -,sacq(0) # 4
-+-mov -,sacq(0) # 5
-+-mov -,sacq(0) # 6
-+-mov -,sacq(0) # 7
-+-mov -,sacq(0) # 8
-+-mov -,sacq(0) # 9
-+-mov -,sacq(0) # 10
-+-mov -,sacq(0) # 11
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+-# mc_interrupt_exit4()
-+-::mc_interrupt_exit4
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+ # mc_interrupt_exit8()
-+ ::mc_interrupt_exit8
-+ mov  -, vw_wait # wait on the VDW
-+@@ -1134,282 +555,5 @@ nop        ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-+-################################################################################
-+-# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+-::mc_setup_uv
-+-
-+-# Read starting kernel
-+-mov ra31, unif
-+-
-+-# Load first request location
-+-add ra_x_base, unif, elem_num # Store x
-+-mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame u base
-+-nop
-+-sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+-
-+-# Read image dimensions
-+-sub rb25,unif,1
-+-sub rb30,unif,1
-+-
-+-# get source pitch
-+-mov rb16, unif
-+-
-+-# get destination pitch
-+-mov r0, unif
-+-mov r1, vdw_setup_1(0)
-+-add rb24, r1, r0
-+-
-+-# load constants
-+-
-+-mov ra20, 1
-+-mov ra21, 32
-+-mov ra22, 256
-+-mov ra23, 8
-+-
-+-mov rb20, 0xffffff00
-+-mov rb21, 64
-+-mov rb22, 255
-+-mov rb23, 24
-+-
-+-# touch vertical context to keep simulator happy
-+-
-+-mov ra8, 0
-+-mov ra9, 0
-+-mov ra10, 0
-+-mov ra11, 0
-+-mov ra12, 0
-+-mov ra13, 0
-+-mov ra14, 0
-+-mov ra15, 0
-+-
-+-# Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-+-
-+-# Compute part of VPM to save data into
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-add rb28, r0, r1
-+-
-+-# Compute base address for first and second access
-+-mov r0, ra_x_base           # Load x
-+-max r0, r0, 0; mov r1, ra_y # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+-shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-add ra_y, r1, 1
-+-add r0, r0, r3
-+-and r0, r0, ~3
-+-max r1, r1, 0 ; mov ra_x_base, r0 # y
-+-min r1, r1, rb_frame_height_minus_1
-+-# submit texture requests for first line
-+-add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-add t0s, r0, r1 ; mov ra_x2_base, r2
-+-add t0s, r2, r1
-+-
-+-# Dump padding words
-+-mov r0, unif
-+-mov r0, unif
-+-mov r0, unif
-+-
-+-# submit texture requests for second line
-+-max r1, ra_y, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1
-+-bra -, ra31
-+-nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_x_base
-+-add t0s, r1, ra_x2_base
-+-
-+-
-+-
-+-################################################################################
-+-
-+-::mc_filter_uv_b
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-sub r2, unif, r3 # compute offset from frame base u to frame base v
-+-add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-+-mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-
-+-# r0 is currently height<<7
-+-# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+-shl r3, r0, 13
-+-shl r3, r3, 8 # Mask off top 8 bits
-+-shr r3, r3, 8
-+-
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# In a B frame, so also set up VPM read
-+-add vr_setup, r3, rb28
-+-
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-
-+-mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-mov r3, 0
-+-
-+-:uvloop_b
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-brr.anyn -, r:uvloop_b
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-add r0, vpm, 1          # Blend in previous VPM contents at this location
-+-brr.anyn -, r:uvloop_b
-+-max r1, r1, 0
-+-add r1, r1, r0
-+-shr vpm, r1, 1
-+-
-+-
-+-# DMA out for U
-+-
-+-mov vw_setup, rb26 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-+-
-+-# DMA out for V
-+-# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+-# Could potentially push this write into the start of the next pipeline stage.
-+-mov r0, 16
-+-mov -, vw_wait
-+-
-+-bra -, ra31
-+-add vw_setup, rb26, r0 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-+-
-+ ::mc_end
-++# Do not add code here because mc_end must appear after all other code.
-+-- 
-+2.5.0
-+
-+
-+From f91f17a1cce2b0f6996569ee7cccf0c9768afd87 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 14:54:25 +0100
-+Subject: [PATCH 20/68] Moved chroma P1 to QPUs
-+
-+---
-+ libavcodec/hevc.c | 38 ++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 38 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 3967361..4dad0e0 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2057,6 +2057,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int reflist = 1;
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[reflist];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                int chan = x0>>8;
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
-+-- 
-+2.5.0
-+
-+
-+From 36aba6ea897093f6528658e78bf4deeba7eeecd2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 15:13:47 +0100
-+Subject: [PATCH 21/68] Added B prediction - not quite right
-+
-+---
-+ libavcodec/hevc.c          |  58 ++++++++++++++++++++++++
-+ libavcodec/rpi_shader.c    | 108 +++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   6 +--
-+ libavcodec/rpi_shader.qasm |  48 ++++++++++----------
-+ 4 files changed, 141 insertions(+), 79 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 4dad0e0..eee617d 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2114,6 +2114,64 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[0];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++
-++                const Mv *mv2         = &current_mv.mv[1];
-++                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
-++                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
-++                intptr_t _mx2         = mx2 << (1 - hshift);
-++                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
-++
-++                int x2_c = x0_c + (mv2->x >> (2 + hshift));
-++                int y2_c = y0_c + (mv2->y >> (2 + hshift));
-++
-++                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx2][0];
-++                      *u++ = rpi_filter_coefs[_mx2][1];
-++                      *u++ = rpi_filter_coefs[_my2][0];
-++                      *u++ = rpi_filter_coefs[_my2][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 170e8ac..5d00cb2 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -265,23 +265,23 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ /* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ /* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ /* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ /* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ /* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+ /* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+ /* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+ /* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+@@ -291,61 +291,63 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+ /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+ /* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 9de4535..e36c4ae 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,8 +6,8 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 142)
-+ #define mc_filter_uv_b (rpi_shader + 360)
-+-#define mc_exit (rpi_shader + 588)
-+-#define mc_interrupt_exit8 (rpi_shader + 606)
-+-#define mc_end (rpi_shader + 636)
-++#define mc_exit (rpi_shader + 592)
-++#define mc_interrupt_exit8 (rpi_shader + 610)
-++#define mc_end (rpi_shader + 640)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index cd7346d..870437d2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -443,23 +443,23 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-++add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+@@ -474,23 +474,25 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15
-++asr r1, r1, 14
-++add r1, r1, ra21
-++asr r1, r1, 6
-+ min r1, r1, rb22
-+ add r0, vpm, 1          # Blend in previous VPM contents at this location
-+ brr.anyn -, r:uvloop_b
-+-- 
-+2.5.0
-+
-+
-+From 5dec9ecc623e90c6e14b72a34a6bffdd2a005edb Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 08:15:55 +0100
-+Subject: [PATCH 22/68] Added flush for SAO
-+
-+---
-+ libavcodec/hevc.c        |  2 +-
-+ libavcodec/hevc_filter.c | 39 ++++++++++++++++++++++++++-------------
-+ 2 files changed, 27 insertions(+), 14 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index eee617d..58da57d 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2890,7 +2890,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             rpi_execute_inter_qpu(s);
-+ #endif
-+             // Transform all blocks
-+-            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 9b6e26d..92a8271 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -871,6 +871,21 @@ static void flush_buffer(AVBufferRef *bref) {
-+     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+     gpu_cache_flush(p);
-+ }
-++
-++static void ff_hevc_flush_chroma(HEVCContext *s)
-++{
-++    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )) {
-++        flush_buffer(s->frame->buf[1]);
-++        flush_buffer(s->frame->buf[2]);
-++        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-++        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-++        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-++    }
-++}
-+ #endif
-+ 
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+@@ -886,31 +901,29 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x - ctb_size, y);
-+         if (y && x_end) {
-+             sao_filter_CTB(s, x, y - ctb_size);
-+-            if (s->threads_type & FF_THREAD_FRAME )
-++            if (s->threads_type & FF_THREAD_FRAME ) {
-++#ifdef RPI_INTER_QPU
-++                ff_hevc_flush_chroma(s);
-++#endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-++            }
-+         }
-+         if (x_end && y_end) {
-+             sao_filter_CTB(s, x , y);
-+-            if (s->threads_type & FF_THREAD_FRAME )
-++            if (s->threads_type & FF_THREAD_FRAME ) {
-++#ifdef RPI_INTER_QPU
-++                ff_hevc_flush_chroma(s);
-++#endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-++            }
-+         }
-+     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-+         //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+-        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )) {
-+ #ifdef RPI_INTER_QPU
-+-            flush_buffer(s->frame->buf[1]);
-+-            flush_buffer(s->frame->buf[2]);
-++        ff_hevc_flush_chroma(s);
-+ #endif
-+-            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+-            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+-            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+-        }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 015b41d06a02e23c7937f6c91c4270b2bc2e48c9 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 09:17:28 +0100
-+Subject: [PATCH 23/68] Stopped using acceleration in unsupported cases
-+
-+---
-+ libavcodec/hevc.c       | 14 +++++++-------
-+ libavcodec/hevc_cabac.c |  4 ++--
-+ 2 files changed, 9 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 58da57d..c59ee63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -1139,15 +1139,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-+-                        printf("Cross component not supported\n"); // TODO
-+-                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+ 
-+             if (lc->tu.cross_pf) {
-+-                printf("Cross component not supported\n"); // TODO
-+-                exit(-1);
-+                 hls_cross_component_pred(s, 1);
-+             }
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+@@ -1176,8 +1172,6 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-+-                        printf("Cross component not supported\n"); // TODO
-+-                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+@@ -2844,7 +2838,13 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-+-    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-++    s->enable_rpi = s->ps.sps->bit_depth == 8
-++                    && s->ps.sps->width <= RPI_MAX_WIDTH
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag
-++                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-++                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-++                    && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-++
-+ #endif
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index f28759b..ca76cb0 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1513,9 +1513,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ #ifdef RPI
-+             if (!use_vpu) {
-+               int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+-              if (max_xy == 0)
-++              if (max_xy == 0) {
-+                   s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-+-              else {
-++              } else {
-+                   int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-+                   if (max_xy < 4)
-+                       col_limit = FFMIN(4, col_limit);
-+-- 
-+2.5.0
-+
-+
-+From 3b96ec07ff377691a80df9b15de202fcff660599 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 09:42:16 +0100
-+Subject: [PATCH 24/68] Split B prediction into two passes
-+
-+---
-+ libavcodec/hevc.c          |   1 +
-+ libavcodec/hevc.h          |   1 +
-+ libavcodec/rpi_qpu.c       |   3 +
-+ libavcodec/rpi_qpu.h       |   1 +
-+ libavcodec/rpi_shader.c    | 559 +++++++++++++++++++++++++++------------------
-+ libavcodec/rpi_shader.h    |  11 +-
-+ libavcodec/rpi_shader.qasm | 196 ++++++++++++++--
-+ 7 files changed, 531 insertions(+), 241 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index c59ee63..7e82602 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3729,6 +3729,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+             p += uv_commands_per_qpu;
-+         }
-+         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-++        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+         s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-+ 
-+     }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index cae6659..3511982 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -943,6 +943,7 @@ typedef struct HEVCContext {
-+     uint32_t *u_mvs[8];
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-++    uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4e90cc1..60bf079 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -636,6 +636,9 @@ unsigned int qpu_get_fn(int num) {
-+     case QPU_MC_FILTER_UV:
-+       fn = mc_filter_uv;
-+       break;
-++    case QPU_MC_FILTER_UV_B0:
-++      fn = mc_filter_uv_b0;
-++      break;
-+     case QPU_MC_FILTER_UV_B:
-+       fn = mc_filter_uv_b;
-+       break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index f9ad333..543c84b 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -29,6 +29,7 @@ enum {
-+   QPU_MC_FILTER_HONLY,
-+   QPU_MC_SETUP_UV,
-+   QPU_MC_FILTER_UV,
-++  QPU_MC_FILTER_UV_B0,
-+   QPU_MC_FILTER_UV_B,
-+   QPU_MC_INTERRUPT_EXIT8,
-+   QPU_MC_END
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 5d00cb2..88ad20b 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -39,18 +39,18 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+ /* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+ /* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+ /* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+ /* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+@@ -62,176 +62,176 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+ /* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+ /* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_uv_b
-+-/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_uv_b0
-++/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+ /* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+ /* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+@@ -253,7 +253,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+ /* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ /* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :uvloop_b
-++// :uvloop_b0
-+ /* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+ /* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+ /* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+@@ -290,7 +290,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+ /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+ /* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+ /* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+ /* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+@@ -306,48 +306,163 @@ unsigned int rpi_shader[] = {
-+ /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ /* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+ /* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_uv_b
-++/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop_b
-++/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index e36c4ae..809e582 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,10 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 142)
-+-#define mc_filter_uv_b (rpi_shader + 360)
-+-#define mc_exit (rpi_shader + 592)
-+-#define mc_interrupt_exit8 (rpi_shader + 610)
-+-#define mc_end (rpi_shader + 640)
-++#define mc_filter_uv (rpi_shader + 150)
-++#define mc_filter_uv_b0 (rpi_shader + 368)
-++#define mc_filter_uv_b (rpi_shader + 586)
-++#define mc_exit (rpi_shader + 818)
-++#define mc_interrupt_exit8 (rpi_shader + 836)
-++#define mc_end (rpi_shader + 866)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 870437d2..635b894 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -26,7 +26,7 @@
-+ # ra23                                          8
-+ #
-+ # rb20                                          0xffffff00
-+-# rb21                                          64
-++# rb21                                          vpm_setup for writing 16bit results into VPM
-+ # rb22                                          255
-+ # rb23                                          24
-+ #
-+@@ -34,7 +34,7 @@
-+ # rb25                                          frame width-1
-+ # rb26                                          height<<23 + width<<16 + vdw_setup_0
-+ # rb27                                          vdw_setup_0 (depends on QPU number)
-+-# rb28                                          vpm_setup (depends on QPU number)
-++# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
-+ # rb29                                          vdw_setup_1(dst_pitch-width)
-+ # rb30                                          frame height-1
-+ # rb31                                          used as temp to count loop iterations
-+@@ -69,8 +69,6 @@
-+ .set ra_y_next,                    ra28
-+ .set ra_y,                         ra29
-+ 
-+-.set rb_const_64,                  rb21
-+-
-+ 
-+ ################################################################################
-+ # mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+@@ -106,7 +104,6 @@ mov ra22, 256
-+ mov ra23, 8
-+ 
-+ mov rb20, 0xffffff00
-+-mov rb21, 64
-+ mov rb22, 255
-+ mov rb23, 24
-+ 
-+@@ -123,6 +120,7 @@ mov ra15, 0
-+ 
-+ # Compute part of VPM to use for DMA output
-+ mov r2, qpu_num
-++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+ and r2, r2, 15
-+ mov r1, r2
-+ asr r1, r1, 2
-+@@ -135,16 +133,21 @@ shl r0, r0, 5
-+ add rb27, r0, r1
-+ 
-+ # Compute part of VPM to save data into
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))
-++mov r2, qpu_num   # qpu_num = abcd
-++shl r2, r2, 1
-++and r2, r2, 15    # r2 = bcd0
-++mov r1, r2        # r1 = bcd0
-++asr r1, r1, 2     # r1 = bc
-++shl r1, r1, 6     # r1 = bc000000
-++mov r0, r2        # r0 = bcd0
-++and r0, r0, 3     # r0 = d0
-++add r0, r0, r1    # r0 = bc0000d0
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add rb28, r0, r1
-++asr r0, r0, 1     # r0 = bc0000d
-++# Prepare VPM command for 16bit intermediates
-++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-++add rb21, r0, r1
-+ 
-+ # Compute base address for first and second access
-+ mov r0, ra_x_base           # Load x
-+@@ -345,6 +348,171 @@ mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-++# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_uv_b0
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop_b0
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop_b0
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++add r1, r1, ra21
-++brr.anyn -, r:uvloop
-++asr r1, r1, 6          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-+ ::mc_filter_uv_b
-+ mov ra31, unif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 6ddd4f127ca17be70a2e60a7b2ff127de89b559c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:04:55 +0100
-+Subject: [PATCH 25/68] Switch to using 16bit temp buffers
-+
-+---
-+ libavcodec/hevc.c          |  2 +-
-+ libavcodec/rpi_shader.c    |  4 ++--
-+ libavcodec/rpi_shader.qasm | 10 +++++-----
-+ 3 files changed, 8 insertions(+), 8 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7e82602..753f85c 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2134,7 +2134,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 88ad20b..ffd3a07 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -220,7 +220,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+ /* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+ /* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+ /* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+ /* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+@@ -346,7 +346,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+ /* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+ /* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 635b894..9577121 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -26,7 +26,7 @@
-+ # ra23                                          8
-+ #
-+ # rb20                                          0xffffff00
-+-# rb21                                          vpm_setup for writing 16bit results into VPM
-++# rb21                                          vpm_setup for reading/writing 16bit results into VPM
-+ # rb22                                          255
-+ # rb23                                          24
-+ #
-+@@ -370,8 +370,8 @@ and rb_x_base_next, r0, ~3
-+ mov ra_y_next, r1
-+ add ra_x2_base_next, rb_x_base_next, r2
-+ 
-+-# set up VPM write
-+-mov vw_setup, rb28
-++# set up VPM write, we need to save 16bit precision
-++mov vw_setup, rb21
-+ 
-+ # get width,height of block
-+ mov r2, 16
-+@@ -554,8 +554,8 @@ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-# In a B frame, so also set up VPM read
-+-add vr_setup, r3, rb28
-++# In a B frame, so also set up VPM read (reading back 16bit precision)
-++add vr_setup, r3, rb21
-+ 
-+ sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b516e30ff4a9354497d3b6ecee77bfaeb69ca4d6 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:30:44 +0100
-+Subject: [PATCH 26/68] Corrected B prediction: matching md5 sum for hobbit50
-+
-+---
-+ libavcodec/rpi_shader.c    | 815 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  36 +-
-+ 3 files changed, 429 insertions(+), 434 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index ffd3a07..77cca46 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -38,431 +38,428 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+ /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 809e582..6562fa9 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 150)
-+-#define mc_filter_uv_b0 (rpi_shader + 368)
-+-#define mc_filter_uv_b (rpi_shader + 586)
-+-#define mc_exit (rpi_shader + 818)
-+-#define mc_interrupt_exit8 (rpi_shader + 836)
-+-#define mc_end (rpi_shader + 866)
-++#define mc_filter_uv (rpi_shader + 152)
-++#define mc_filter_uv_b0 (rpi_shader + 370)
-++#define mc_filter_uv_b (rpi_shader + 584)
-++#define mc_exit (rpi_shader + 812)
-++#define mc_interrupt_exit8 (rpi_shader + 830)
-++#define mc_end (rpi_shader + 860)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 9577121..562dc35 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -39,13 +39,13 @@
-+ # rb30                                          frame height-1
-+ # rb31                                          used as temp to count loop iterations
-+ #
-+-# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
-+ # ra24                                          clipped(row start address+8+elem_num)&~3
-+ # ra25                                          per-channel shifts 2
-+ # ra26                                          next ra24
-+ # ra27                                          next ra25
-+ # ra28                                          next y
-+ # ra29                                          y for next texture access
-++# ra30                                          64
-+ #
-+ # ra31                                          next kernel address
-+ 
-+@@ -102,6 +102,7 @@ mov ra20, 1
-+ mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-++mov ra30, 64
-+ 
-+ mov rb20, 0xffffff00
-+ mov rb22, 255
-+@@ -472,7 +473,7 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b0
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop
-++asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
-+ nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+ 
-+ # apply vertical filter and write to VPM
-+@@ -487,18 +488,18 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-add r1, r1, ra21
-+-brr.anyn -, r:uvloop
-+-asr r1, r1, 6          # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++#asr r1, r1, 14
-++#add r1, r1, ra21
-++brr.anyn -, r:uvloop_b0
-++asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-++nop                    # Delay 2
-++nop                    # Delay 3
-+ 
-+ # DMA out for U
-+ 
-+ mov vw_setup, rb26 # VDW setup 0
-+ mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-++mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
-+ 
-+ # DMA out for V
-+ # We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+@@ -639,12 +640,11 @@ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+ sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+ asr ra15, r0, 8         ; nop
-+-nop                     ; nop
-++nop                     ; nop    # TODO improve use of delay slots
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -658,15 +658,13 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-add r1, r1, ra21
-+-asr r1, r1, 6
-+-min r1, r1, rb22
-+-add r0, vpm, 1          # Blend in previous VPM contents at this location
-++asr r1, r1, 14          # shift2=6
-++add r1, r1, vpm         # Blend in previous VPM contents at this location
-++add r1, r1, ra30
-+ brr.anyn -, r:uvloop_b
-+-max r1, r1, 0
-+-add r1, r1, r0
-+-shr vpm, r1, 1
-++asr r1, r1, 7           # Delay 1
-++min r1, r1, rb22        # Delay 2
-++max vpm, r1, 0          # Delay 3
-+ 
-+ 
-+ # DMA out for U
-+-- 
-+2.5.0
-+
-+
-+From 5a589f03af71ff87e50d46520ed652571357c9cc Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:55:07 +0100
-+Subject: [PATCH 27/68] P prediction uses 4 tap filters
-+
-+---
-+ libavcodec/hevc.c          |  50 ++--
-+ libavcodec/rpi_shader.c    | 631 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  43 +--
-+ 4 files changed, 344 insertions(+), 390 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 753f85c..16f2200 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -63,15 +63,15 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+-static uint32_t rpi_filter_coefs[8][2] = {
-+-        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-++static uint32_t rpi_filter_coefs[8][1] = {
-++        { ENCODE_COEFFS(   0,  64,   0,   0) },
-++        { ENCODE_COEFFS(  -2,  58,  10,  -2) },
-++        { ENCODE_COEFFS(  -4,  54,  16,  -2) },
-++        { ENCODE_COEFFS(  -6,  46,  28,  -4) },
-++        { ENCODE_COEFFS(  -4,  36,  36,  -4) },
-++        { ENCODE_COEFFS(  -4,  28,  46,  -6) },
-++        { ENCODE_COEFFS(  -2,  16,  54,  -4) },
-++        { ENCODE_COEFFS(  -2,  10,  58,  -2) }
-+ };
-+ 
-+ static uint32_t get_vc_address(AVBufferRef *bref) {
-+@@ -2014,16 +2014,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2071,16 +2071,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2135,29 +2135,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+-                      *u++ = rpi_filter_coefs[_mx2][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+-                      *u++ = rpi_filter_coefs[_my2][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 77cca46..c8d0728 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -116,8 +116,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+@@ -128,338 +128,315 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ /* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ /* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 6562fa9..1bf7a68 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 370)
-+-#define mc_filter_uv_b (rpi_shader + 584)
-+-#define mc_exit (rpi_shader + 812)
-+-#define mc_interrupt_exit8 (rpi_shader + 830)
-+-#define mc_end (rpi_shader + 860)
-++#define mc_filter_uv_b0 (rpi_shader + 324)
-++#define mc_filter_uv_b (rpi_shader + 538)
-++#define mc_exit (rpi_shader + 766)
-++#define mc_interrupt_exit8 (rpi_shader + 784)
-++#define mc_end (rpi_shader + 814)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 562dc35..8e4f18f 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -16,8 +16,8 @@
-+ # ra19                                          next ra17
-+ #
-+ # rb16                                          pitch
-+-# rb17                                          height + 5
-+-# rb18                                          height + 7
-++# rb17                                          height + 1
-++# rb18                                          height + 3
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+@@ -214,8 +214,8 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+@@ -230,18 +230,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -283,26 +276,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+@@ -312,14 +293,10 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14
-+-- 
-+2.5.0
-+
-+
-+From b267b33e74268586aacdcc31ca02c35aba69a230 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:03:51 +0100
-+Subject: [PATCH 28/68] Optimised B0 pass
-+
-+---
-+ libavcodec/rpi_shader.c    | 424 +++++++++++++++++++++------------------------
-+ libavcodec/rpi_shader.h    |   8 +-
-+ libavcodec/rpi_shader.qasm |  43 +----
-+ 3 files changed, 212 insertions(+), 263 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index c8d0728..1f63ee0 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -204,239 +204,215 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 1bf7a68..cb74887 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 324)
-+-#define mc_filter_uv_b (rpi_shader + 538)
-+-#define mc_exit (rpi_shader + 766)
-+-#define mc_interrupt_exit8 (rpi_shader + 784)
-+-#define mc_end (rpi_shader + 814)
-++#define mc_filter_uv_b (rpi_shader + 490)
-++#define mc_exit (rpi_shader + 718)
-++#define mc_interrupt_exit8 (rpi_shader + 736)
-++#define mc_end (rpi_shader + 766)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 8e4f18f..faa5755 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -357,15 +357,13 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -373,18 +371,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -426,26 +417,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b0
-+@@ -455,18 +434,12 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-#asr r1, r1, 14
-+-#add r1, r1, ra21
-+ brr.anyn -, r:uvloop_b0
-+ asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-+ nop                    # Delay 2
-+-- 
-+2.5.0
-+
-+
-+From 7941c95bd5e968d6e1ea0462cb27c475aa4ee5e1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:12:43 +0100
-+Subject: [PATCH 29/68] Optimised B pass
-+
-+---
-+ libavcodec/rpi_shader.c    | 202 ++++++++++++++++++++-------------------------
-+ libavcodec/rpi_shader.h    |   6 +-
-+ libavcodec/rpi_shader.qasm |  41 ++-------
-+ 3 files changed, 100 insertions(+), 149 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 1f63ee0..4e6c5ea 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -289,8 +289,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+ /* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+@@ -299,120 +299,96 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+ /* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index cb74887..53da629 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -7,8 +7,8 @@ extern unsigned int rpi_shader[];
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 324)
-+ #define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 718)
-+-#define mc_interrupt_exit8 (rpi_shader + 736)
-+-#define mc_end (rpi_shader + 766)
-++#define mc_exit (rpi_shader + 670)
-++#define mc_interrupt_exit8 (rpi_shader + 688)
-++#define mc_end (rpi_shader + 718)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index faa5755..f38c926 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -491,8 +491,8 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ 
-+ # r0 is currently height<<7
-+@@ -508,8 +508,6 @@ add rb26, r0, rb27
-+ # In a B frame, so also set up VPM read (reading back 16bit precision)
-+ add vr_setup, r3, rb21
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -517,18 +515,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -570,26 +561,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+@@ -598,14 +577,10 @@ nop                     ; nop    # TODO improve use of delay slots
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14          # shift2=6
-+-- 
-+2.5.0
-+
-+
-+From 3c9637fbe9311db205c5e3a1ab178771dab22856 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:17:09 +0100
-+Subject: [PATCH 30/68] Used P delay slots more efficiently
-+
-+---
-+ libavcodec/rpi_shader.c    | 437 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  19 +-
-+ 3 files changed, 228 insertions(+), 238 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 4e6c5ea..a1af4e3 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -156,239 +156,236 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ /* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+ /* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 53da629..1fb3e37 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 324)
-+-#define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 670)
-+-#define mc_interrupt_exit8 (rpi_shader + 688)
-+-#define mc_end (rpi_shader + 718)
-++#define mc_filter_uv_b0 (rpi_shader + 318)
-++#define mc_filter_uv_b (rpi_shader + 484)
-++#define mc_exit (rpi_shader + 664)
-++#define mc_interrupt_exit8 (rpi_shader + 682)
-++#define mc_end (rpi_shader + 712)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index f38c926..02e95dd 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -268,6 +268,7 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-++# apply horizontal filter
-+ nop                  ; mul24 r2, r0, ra0
-+ nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+@@ -276,20 +277,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop
-+-nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 07f733af90de9d5823f62c0b7276bb1c7187ec6f Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:22:25 +0100
-+Subject: [PATCH 31/68] Improved use of delay slots
-+
-+---
-+ libavcodec/rpi_shader.c    | 503 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  41 ++--
-+ 3 files changed, 265 insertions(+), 289 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index a1af4e3..c498f28 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -122,270 +122,263 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit8
-+-/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 1fb3e37..3fac45f 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 318)
-+-#define mc_filter_uv_b (rpi_shader + 484)
-+-#define mc_exit (rpi_shader + 664)
-+-#define mc_interrupt_exit8 (rpi_shader + 682)
-+-#define mc_end (rpi_shader + 712)
-++#define mc_filter_uv_b0 (rpi_shader + 316)
-++#define mc_filter_uv_b (rpi_shader + 476)
-++#define mc_exit (rpi_shader + 650)
-++#define mc_interrupt_exit8 (rpi_shader + 668)
-++#define mc_end (rpi_shader + 698)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 02e95dd..10f5113 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -221,8 +221,6 @@ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -410,20 +408,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop_b0
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
-+-nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -432,9 +422,9 @@ nop                     ; mul24 r0, ra13, rb9
-+ add r1, r1, r0          ; mul24 r0, ra12, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++sub.setf -, r3, rb18
-+ brr.anyn -, r:uvloop_b0
-+-asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-++asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still in 16bit precision
-+ nop                    # Delay 2
-+ nop                    # Delay 3
-+ 
-+@@ -554,19 +544,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop_b
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr ra15, r0, 8         ; nop
-+-nop                     ; nop    # TODO improve use of delay slots
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 29956c5549eb94e418c42e838d0bfceeb95730b0 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:31:23 +0100
-+Subject: [PATCH 32/68] Avoid writeback of first B results
-+
-+---
-+ libavcodec/rpi_shader.c    | 229 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |   8 +-
-+ libavcodec/rpi_shader.qasm |  18 +---
-+ 3 files changed, 121 insertions(+), 134 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index c498f28..ba453a2 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -255,130 +255,125 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+ /* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-+ /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+ /* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3fac45f..45dbe0e 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 316)
-+-#define mc_filter_uv_b (rpi_shader + 476)
-+-#define mc_exit (rpi_shader + 650)
-+-#define mc_interrupt_exit8 (rpi_shader + 668)
-+-#define mc_end (rpi_shader + 698)
-++#define mc_filter_uv_b (rpi_shader + 466)
-++#define mc_exit (rpi_shader + 640)
-++#define mc_interrupt_exit8 (rpi_shader + 658)
-++#define mc_end (rpi_shader + 688)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 10f5113..e138c95 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -428,22 +428,14 @@ asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still
-+ nop                    # Delay 2
-+ nop                    # Delay 3
-+ 
-++# in pass0 we don't really need to save any results, but need to discard the uniforms
-+ # DMA out for U
-+ 
-+-mov vw_setup, rb26 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
-+-
-+-# DMA out for V
-+-# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+-# Could potentially push this write into the start of the next pipeline stage.
-+-mov r0, 16
-+-mov -, vw_wait
-+-
-+ bra -, ra31
-+-add vw_setup, rb26, r0 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-++mov r0, unif           # Delay 1
-++mov r0, unif           # Delay 2
-++nop                    # Delay 3
-++
-+ 
-+ ################################################################################
-+ 
-+-- 
-+2.5.0
-+
-+
-+From c184ce179f16ca497ed003805193651fa3b30817 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:36:24 +0100
-+Subject: [PATCH 33/68] Cutdown size of chroma prediction commands
-+
-+---
-+ libavcodec/hevc.c          |  17 +-
-+ libavcodec/rpi_shader.c    | 543 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  11 +-
-+ 4 files changed, 281 insertions(+), 302 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 16f2200..da81a54 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -54,7 +54,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-+-#define RPI_CHROMA_COMMAND_WORDS 12
-++#define RPI_CHROMA_COMMAND_WORDS 10
-+ #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+@@ -2019,11 +2019,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+-                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2078,9 +2075,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2141,11 +2136,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+-                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-+-                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                      u+=2; // Intermediate results are not written back in first pass of B filtering
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-+@@ -2153,11 +2145,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+-                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2795,7 +2784,7 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-+-        s->u_mvs[i] += 3;  // Padding words
-++        s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index ba453a2..b0b93b5 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -89,291 +89,286 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+ /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 45dbe0e..99927c4 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 316)
-+-#define mc_filter_uv_b (rpi_shader + 466)
-+-#define mc_exit (rpi_shader + 640)
-+-#define mc_interrupt_exit8 (rpi_shader + 658)
-+-#define mc_end (rpi_shader + 688)
-++#define mc_filter_uv (rpi_shader + 148)
-++#define mc_filter_uv_b0 (rpi_shader + 310)
-++#define mc_filter_uv_b (rpi_shader + 458)
-++#define mc_exit (rpi_shader + 630)
-++#define mc_interrupt_exit8 (rpi_shader + 648)
-++#define mc_end (rpi_shader + 678)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index e138c95..d9ffcda 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -167,8 +167,6 @@ add t0s, r2, r1
-+ 
-+ # Dump padding words
-+ mov r0, unif
-+-mov r0, unif
-+-mov r0, unif
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -228,11 +226,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -362,11 +359,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -490,11 +486,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+-- 
-+2.5.0
-+
-+
-+From 5edce4e2a69b82aceb72f331737b5b00bf3af912 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:21:49 +0100
-+Subject: [PATCH 34/68] hevc: don't redirect when not rpi_enabled
-+
-+---
-+ libavcodec/hevc.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index da81a54..60b3d97 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -1455,7 +1455,7 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-+  */
-+ 
-+ #ifdef RPI_INTER
-+-#define RPI_REDIRECT(fn) rpi_ ## fn
-++#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
-+ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+-- 
-+2.5.0
-+
-+
-+From 86652e6a111a593a8c14c8eecaa7e26a068febcf Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:22:02 +0100
-+Subject: [PATCH 35/68] Use /dev/vcio for mailbox access
-+
-+---
-+ libavcodec/rpi_mailbox.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+index 536896f..77a56dd 100644
-+--- a/libavcodec/rpi_mailbox.c
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -39,7 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ 
-+ #define MAJOR_NUM 100
-+ #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+-#define DEVICE_FILE_NAME "/dev/char_dev"
-++#define DEVICE_FILE_NAME "/dev/vcio"
-+ 
-+ #include "rpi_mailbox.h"
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 05eb83c2f257c17a02abc01a6be6ae9df2d8e653 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:25:25 +0100
-+Subject: [PATCH 36/68] Use vcsm for all memory allocations
-+
-+---
-+ libavcodec/rpi_qpu.c | 174 +++++++++++++++++++--------------------------------
-+ 1 file changed, 64 insertions(+), 110 deletions(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 60bf079..f62051f 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,7 +1,5 @@
-+ #ifdef RPI
-+-// define RPI_USE_VCSM to use the vcsm device for shared memory
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+-#define RPI_USE_VCSM
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+ #define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+@@ -25,9 +23,7 @@
-+ #include "rpi_shader.h"
-+ #include "rpi_hevc_transform.h"
-+ 
-+-#ifdef RPI_USE_VCSM
-+ #include "rpi_user_vcsm.h"
-+-#endif
-+ 
-+ // On Pi2 there is no way to access the VPU L2 cache
-+ // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+@@ -96,7 +92,6 @@ struct GPU
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-+   short transMatrix2even[16*16*2];
-+   int open_count; // Number of allocated video buffers
-+-  unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+   int      vc; // Address in GPU memory
-+   int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-+@@ -105,6 +100,7 @@ struct GPU
-+ // Stop more than one thread trying to allocate memory or use the processing resources at once
-+ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-++static GPU_MEM_PTR_T gpu_mem_ptr;
-+ 
-+ #if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-+ static unsigned int Microseconds(void) {
-+@@ -132,39 +128,27 @@ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+ 
-++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-++static void gpu_free_internal(GPU_MEM_PTR_T *p);
-++
-+ // Connect to QPU, returns 0 on success.
-+ static int gpu_init(volatile struct GPU **gpu) {
-+   int mb = mbox_open();
-+   int vc;
-+-  int handle;
-+   volatile struct GPU* ptr;
-+ 	if (mb < 0)
-+ 		return -1;
-+ 
-+ 	if (qpu_enable(mb, 1)) return -2;
-+ 
-+-#ifdef RPI_USE_VCSM
-+   vcsm_init();
-+-#endif
-++  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-++  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-++  memset(ptr, 0, sizeof *ptr);
-++  vc = gpu_mem_ptr.vc;
-+ 
-+-  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
-+-  if (!handle)
-+-  {
-+-    qpu_enable(mb, 0);
-+-    return -3;
-+-  }
-+-	vc = mem_lock(mb, handle);
-+-	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
-+-	if (ptr == NULL)
-+-	{	mem_free(mb, handle);
-+-		mem_unlock(mb, handle);
-+-		qpu_enable(mb, 0);
-+-		return -4;
-+-	}
-+-
-+-	ptr->mb = mb;
-+-	ptr->vc_handle = handle;
-+-	ptr->vc = vc;
-++  ptr->mb = mb;
-++  ptr->vc = vc;
-+ 
-+   printf("GPU allocated at 0x%x\n",vc);
-+ 
-+@@ -226,94 +210,74 @@ static void gpu_unlock(void) {
-+   pthread_mutex_unlock(&gpu_mutex);
-+ }
-+ 
-++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-++  assert(p->vcsm_handle);
-++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++  assert(p->vc_handle);
-++  p->arm = vcsm_lock(p->vcsm_handle);
-++  assert(p->arm);
-++  p->vc = mem_lock(mb, p->vc_handle);
-++  assert(p->vc);
-++  return 0;
-++}
-++
-+ // Allocate memory on GPU
-+ // Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-+ // Returns 0 on success.
-+ // This allocates memory that will not be cached in ARM's data cache.
-+ // Therefore safe to use without data cache flushing.
-+-int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-++{
-++  int r;
-+   gpu_lock();
-+-  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+-  p->vcsm_handle = 0;
-+-  if (!p->vc_handle)
-+-  {
-+-    qpu_enable(gpu->mb, 0);
-+-    return -3;
-+-  }
-+-  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+-  p->numbytes = numbytes;
-+-  if (p->arm == NULL)
-+-  {
-+-    mem_free(gpu->mb, p->vc_handle);
-+-    mem_unlock(gpu->mb, p->vc_handle);
-+-    gpu_unlock();
-+-    qpu_enable(gpu->mb, 0);
-+-    return -4;
-+-  }
-++  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-+   gpu->open_count++;
-+   gpu_unlock();
-+-  return 0;
-++  return r;
-+ }
-+ 
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-  // This only works when using RPI_USE_VCSM
-+   void *tmp = vcsm_lock(p->vcsm_handle);
-+   vcsm_unlock_ptr(tmp);
-+ }
-+ 
-++static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-++  assert(p->vcsm_handle);
-++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++  assert(p->vc_handle);
-++  p->arm = vcsm_lock(p->vcsm_handle);
-++  assert(p->arm);
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  assert(p->vc);
-++  return 0;
-++}
-++
-+ // This allocates data that will be
-+ //    Cached in ARM L2
-+ //    Uncached in VPU L2
-+-int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-++{
-++  int r;
-+   gpu_lock();
-+-#ifdef RPI_USE_VCSM
-+-  {
-+-      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
-+-      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-      p->arm = vcsm_lock(p->vcsm_handle);
-+-      p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  }
-+-#else
-+-  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+-  p->vcsm_handle = 0;
-+-  if (!p->handle)
-+-  {
-+-    qpu_enable(gpu->mb, 0);
-+-    return -3;
-+-  }
-+-  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  printf("This mapmem_private does not seem to work\n");
-+-  exit(-1);
-+-  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+-  p->numbytes = numbytes;
-+-  if (p->arm == NULL)
-+-  {
-+-    mem_free(gpu->mb, p->handle);
-+-    mem_unlock(gpu->mb, p->handle);
-+-    gpu_unlock();
-+-    qpu_enable(gpu->mb, 0);
-+-    return -4;
-+-  }
-+-#endif
-++  r = gpu_malloc_cached_internal(numbytes, p);
-+   gpu->open_count++;
-+   gpu_unlock();
-+-  return 0;
-++  return r;
-+ }
-+ 
-+ static void gpu_term(void)
-+ {
-+-	int mb;
-+-	unsigned handle;
-++  int mb;
-+ 
-+   if (gpu==NULL)
-+     return;
-+   mb = gpu->mb;
-+-  handle = gpu->vc_handle;
-+ 
-+ #ifdef RPI_ASYNC
-+   {
-+@@ -323,37 +287,26 @@ static void gpu_term(void)
-+   }
-+ #endif
-+ 
-++  qpu_enable(mb, 0);
-++  gpu_free_internal(&gpu_mem_ptr);
-+ 
-+-	unmapmem((void*)gpu, sizeof(struct GPU));
-+-	mem_unlock(mb, handle);
-+-	mem_free(mb, handle);
-+-	qpu_enable(mb, 0);
-+-#ifdef RPI_USE_VCSM
-+   vcsm_exit();
-+-#endif
-+-	mbox_close(mb);
-++
-++  mbox_close(mb);
-+   gpu = NULL;
-+ }
-+ 
-+-void gpu_free(GPU_MEM_PTR_T *p) {
-++void gpu_free_internal(GPU_MEM_PTR_T *p) {
-+   int mb = gpu->mb;
-+-	unsigned handle = p->vc_handle;
-++  mem_unlock(mb,p->vc_handle);
-++  vcsm_unlock_ptr(p->arm);
-++  vcsm_free(p->vcsm_handle);
-++}
-++
-++void gpu_free(GPU_MEM_PTR_T *p) {
-+   gpu_lock();
-+-#ifdef RPI_USE_VCSM
-+-  if (p->vcsm_handle) {
-+-      mem_unlock(mb,p->vc_handle);
-+-      vcsm_unlock_ptr(p->arm);
-+-      vcsm_free(p->vcsm_handle);
-+-  } else {
-+-	unmapmem((void*)p->arm, sizeof(struct GPU));
-+-      mem_unlock(mb, handle);
-+-      mem_free(mb, handle);
-+-  }
-+-#else
-+-	unmapmem((void*)p->arm, sizeof(struct GPU));
-+-	mem_unlock(mb, handle);
-+-	mem_free(mb, handle);
-+-#endif
-++
-++  gpu_free_internal(p);
-+ 
-+   gpu->open_count--;
-+   if (gpu->open_count==0) {
-+@@ -386,20 +339,21 @@ unsigned int vpu_get_constants(void) {
-+ 
-+ static void *vpu_start(void *arg) {
-+   while(1) {
-++    int *p;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-+       pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+-    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    p = vpu_cmds[vpu_async_head%MAXCMDS];
-+     pthread_mutex_unlock(&post_mutex);
-+ 
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-+     if (p[7]) {
-+-        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-++        //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+     }
-+     vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+-- 
-+2.5.0
-+
-+
-+From 72b441dc9a9965ce3e5812be87081ffae1e166de Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:43:17 +0100
-+Subject: [PATCH 37/68] Enable EARLY_MALLOC and fix sps access bug
-+
-+---
-+ libavcodec/hevc.c | 5 +++--
-+ 1 file changed, 3 insertions(+), 2 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 60b3d97..eee22eb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -42,7 +42,7 @@
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-+   // For some unknown reason, the code seems to crash if I do a late malloc
-+-  #define EARLY_MALLOC
-++  //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ #endif
-+@@ -147,7 +147,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ #ifdef RPI
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-++    assert(sps);
-++    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+     printf("pic_arrays_init\n");
-+     printf("Allocated %d\n",coefs_per_row);
-+-- 
-+2.5.0
-+
-+
-+From 6a0001e44872f9333caf6c6e7e5046cd56a3a21a Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 16:40:51 +0100
-+Subject: [PATCH 38/68] Add copy of av_mod_uintp2 for use with stable ffmpeg
-+
-+---
-+ libavcodec/hevc.c | 8 ++++++++
-+ 1 file changed, 8 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index eee22eb..cfdf6c2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -49,6 +49,14 @@
-+ 
-+ // #define DISABLE_MC
-+ 
-++#ifndef av_mod_uintp2
-++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
-++{
-++    return a & ((1 << p) - 1);
-++}
-++#   define av_mod_uintp2   av_mod_uintp2_c
-++#endif
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 6fbc046c87e413d38c789e82f73dfece27a64ff4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 18 May 2015 11:11:02 +0100
-+Subject: [PATCH 39/68] Added support for weighted prediction in P frames
-+
-+---
-+ libavcodec/hevc.c          |  52 ++++-
-+ libavcodec/rpi_shader.c    | 566 +++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  39 +++-
-+ 4 files changed, 384 insertions(+), 285 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index cfdf6c2..0906ac2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -62,7 +62,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-+-#define RPI_CHROMA_COMMAND_WORDS 10
-++#define RPI_CHROMA_COMMAND_WORDS 12
-+ #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+@@ -2018,6 +2018,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                 int chan = x0>>8;
-++                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2030,6 +2032,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      if (weight_flag) {
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1] & 0xffff);
-++                      } else {
-++                          *u++ = 1; // Weight of 1 and offset of 0
-++                          *u++ = 1;
-++                      }
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2072,6 +2081,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                 int chan = x0>>8;
-++                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2085,6 +2096,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      if (weight_flag) {
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-++                      } else {
-++                          *u++ = 1; // Weight of 1 and offset of 0
-++                          *u++ = 1;
-++                      }
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2146,6 +2164,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      u+=2; // Weights not supported in B slices
-+                       u+=2; // Intermediate results are not written back in first pass of B filtering
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+@@ -2156,6 +2175,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+                       *u++ = rpi_filter_coefs[_my2][0];
-++                      u+=2; // Weights not supported in B slices
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2782,6 +2802,9 @@ static void rpi_inter_clear(HEVCContext *s)
-+     int i;
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-++    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++
-+     for(i=0;i<8;i++) {
-+         s->u_mvs[i] = s->mvs_base[i];
-+         *s->u_mvs[i]++ = 0;
-+@@ -2793,6 +2816,13 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-++        if (weight_flag) {
-++            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-++            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-++        } else {
-++            *s->u_mvs[i]++ = 1 << 5;
-++            *s->u_mvs[i]++ = 6;
-++        }
-+         s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+@@ -2836,12 +2866,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-++#ifdef RPI_INTER_QPU
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+                     && s->ps.sps->width <= RPI_MAX_WIDTH
-+                     && !s->ps.pps->cross_component_prediction_enabled_flag
-+                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-+-                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-++#else
-++    s->enable_rpi = s->ps.sps->bit_depth == 8
-++                    && s->ps.sps->width <= RPI_MAX_WIDTH
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag
-++                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-++#endif
-++
-++    /*if (!s->enable_rpi) {
-++      if (s->ps.pps->cross_component_prediction_enabled_flag)
-++        printf("Cross component\n");
-++      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-++        printf("Tiles\n");
-++      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-++        printf("Weighted P slice\n");
-++      if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-++        printf("Weighted B slice\n");
-++    }*/
-+ 
-+ #endif
-+ 
-+@@ -2974,6 +3021,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-+ 
-+ #ifdef RPI
-+     s->enable_rpi = 0;
-++    //printf("Wavefront\n");
-+ #endif
-+ 
-+     if(ctb_row) {
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index b0b93b5..3f04d80 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -88,287 +88,307 @@ unsigned int rpi_shader[] = {
-+ /* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+ /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 99927c4..cec9901 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 148)
-+-#define mc_filter_uv_b0 (rpi_shader + 310)
-+-#define mc_filter_uv_b (rpi_shader + 458)
-+-#define mc_exit (rpi_shader + 630)
-+-#define mc_interrupt_exit8 (rpi_shader + 648)
-+-#define mc_end (rpi_shader + 678)
-++#define mc_filter_uv (rpi_shader + 152)
-++#define mc_filter_uv_b0 (rpi_shader + 342)
-++#define mc_filter_uv_b (rpi_shader + 494)
-++#define mc_exit (rpi_shader + 670)
-++#define mc_interrupt_exit8 (rpi_shader + 688)
-++#define mc_end (rpi_shader + 718)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index d9ffcda..97c4c02 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -9,7 +9,12 @@
-+ #                                               (ra15 isn't clamped to zero - this happens during the
-+ #                                                copy to ra14, and during its use in the vertical filter)
-+ #
-+-# rb8...rb15                                    eight vertical filter coefficients
-++# rb8...rb11                                    eight vertical filter coefficients
-++
-++# rb12 offset to add before shift
-++# rb13 shift
-++# rb14 weight (U on left, V on right)
-++# rb15 offset (U on left, V on right)
-+ #
-+ # ra16                                          clipped(row start address+elem_num)&~3
-+ # ra17                                          per-channel shifts
-+@@ -165,6 +170,9 @@ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_x2_base, r2
-+ add t0s, r2, r1
-+ 
-++mov rb12,unif # offset before shift
-++mov rb13,unif # offset after shift
-++
-+ # Dump padding words
-+ mov r0, unif
-+ 
-+@@ -231,11 +239,21 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r0, unif # U offset/weight
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++mov r0, unif # V offset/weight
-++asr.ifnz rb15, r0, r2
-++shl r0, r0, r2
-++asr.ifnz rb14, r0, r2
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+ mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+@@ -279,6 +297,11 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-++mov rb12,32
-++mov rb13,6
-++mov rb14,1
-++mov rb15,0
-++
-+ # apply vertical filter and write to VPM
-+ 
-+ nop                     ; mul24 r1, ra14, rb10
-+@@ -288,9 +311,11 @@ add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14
-+-add r1, r1, ra21
-++nop                     ; mul24 r1, r1, rb14
-++add r1, r1, rb12
-++asr r1, r1, rb13
-+ brr.anyn -, r:uvloop
-+-asr r1, r1, 6          # Delay 1
-++add r1, r1, rb15       # Delay 1
-+ min r1, r1, rb22       # Delay 2
-+ max vpm, r1, 0         # Delay 3
-+ 
-+@@ -364,6 +389,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov r0, unif # U offset/weight
-++mov r0, unif # V offset/weight
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+@@ -491,6 +519,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov r0, unif # U offset/weight
-++mov r0, unif # V offset/weight
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 21506fa30a0953a1caa467ea31f9ee2ebbdf5b79 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 19 May 2015 08:43:30 +0100
-+Subject: [PATCH 40/68] Improved ordering of tasks
-+
-+---
-+ libavcodec/hevc.c | 8 ++++----
-+ 1 file changed, 4 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 0906ac2..1cc7900 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2930,15 +2930,15 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-#ifdef RPI_INTER_QPU
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-#endif
-+             // Transform all blocks
-+             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-++#ifdef RPI_INTER_QPU
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++#endif
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-+ 
-+-- 
-+2.5.0
-+
-+
-+From c0437176b4799453c7731e8372c622c03d65db1b Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 19:58:19 +0100
-+Subject: [PATCH 41/68] Drafted Luma inter prediction
-+
-+---
-+ libavcodec/rpi_shader.qasm | 594 ++++++++++++++++++++++++++++++++++++++++++---
-+ 1 file changed, 554 insertions(+), 40 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 97c4c02..9cfc0d9 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -2,7 +2,10 @@
-+ #
-+ # ra0...ra7                                     eight horizontal filter coefficients
-+ #
-+-# rb1...rb7                                     seven shifted copies of the current unfiltered row
-++# rb0 rx_shift2
-++# rb1 ra_y2_next
-++#
-++# rb4...rb7
-+ #
-+ # ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
-+ #
-+@@ -26,9 +29,9 @@
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+-# ra21                                          32
-++# ra21                                          ra_21
-+ # ra22                                          256
-+-# ra23                                          8
-++# ra23                                          rx_shift2_next
-+ #
-+ # rb20                                          0xffffff00
-+ # rb21                                          vpm_setup for reading/writing 16bit results into VPM
-+@@ -57,16 +60,23 @@
-+ .set rb_frame_width_minus_1,       rb25
-+ .set rb_frame_height_minus_1,      rb30
-+ .set rb_pitch,                     rb16
-+-.set ra_x_base,                    ra16
-+-.set rb_x_base_next,               rb19
-+-.set ra_x2_base,                   ra24
-+-.set ra_x2_base_next,              ra26
-++.set ra_x,                         ra16
-++.set ra_y2,                        ra21
-++.set ra_y2_next,                   rb1
-++
-++.set rb_x_next,                    rb19
-++.set rx_frame_base2_next,          rb19
-++
-++.set ra_frame_base,                ra24
-++.set ra_frame_base_next,           ra26
-+ .set ra_xshift,                    ra17
-+ 
-+-.set ra_x2shift,                   ra25
-+ .set ra_u2v_ref_offset,            ra25
-++.set ra_frame_base2,               ra25
-+ 
-+ .set ra_xshift_next,               ra19
-++.set rx_xshift2,                   rb0
-++.set rx_xshift2_next,              ra23
-+ 
-+ .set ra_x2shift_next,              ra27
-+ .set ra_u2v_dst_offset,            ra27
-+@@ -83,11 +93,11 @@
-+ mov ra31, unif
-+ 
-+ # Load first request location
-+-add ra_x_base, unif, elem_num # Store x
-++add ra_x, unif, elem_num # Store x
-+ mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame u base
-++mov ra_frame_base, unif # Store frame u base
-+ nop
-+-sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-++sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
-+ 
-+ # Read image dimensions
-+ sub rb25,unif,1
-+@@ -104,9 +114,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 32
-+ mov ra22, 256
-+-mov ra23, 8
-+ mov ra30, 64
-+ 
-+ mov rb20, 0xffffff00
-+@@ -156,18 +164,18 @@ mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which i
-+ add rb21, r0, r1
-+ 
-+ # Compute base address for first and second access
-+-mov r0, ra_x_base           # Load x
-++mov r0, ra_x           # Load x
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
-+ shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ add ra_y, r1, 1
-+ add r0, r0, r3
-+ and r0, r0, ~3
-+-max r1, r1, 0 ; mov ra_x_base, r0 # y
-++max r1, r1, 0 ; mov ra_x, r0 # y
-+ min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-+ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r0, r1 ; mov ra_frame_base, r2
-+ add t0s, r2, r1
-+ 
-+ mov rb12,unif # offset before shift
-+@@ -182,8 +190,8 @@ min r1, r1, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1
-+ bra -, ra31
-+ nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_x_base
-+-add t0s, r1, ra_x2_base
-++add t0s, r1, ra_x
-++add t0s, r1, ra_frame_base
-+ 
-+ 
-+ 
-+@@ -192,7 +200,7 @@ add t0s, r1, ra_x2_base
-+ # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-+ ::mc_filter_uv
-+ mov ra31, unif
-+ 
-+@@ -207,9 +215,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -265,16 +273,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -297,7 +305,7 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-+-mov rb12,32
-++mov rb12,32 # TODO remove these to make P weighted prediction work properly
-+ mov rb13,6
-+ mov rb14,1
-+ mov rb15,0
-+@@ -342,7 +350,7 @@ mov vw_addr, unif # start the VDW
-+ # mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-+ ::mc_filter_uv_b0
-+ mov ra31, unif
-+ 
-+@@ -357,9 +365,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write, we need to save 16bit precision
-+ mov vw_setup, rb21
-+@@ -408,16 +416,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -477,9 +485,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -538,16 +546,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -642,5 +650,511 @@ nop        ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-++
-++
-++
-++
-++# LUMA CODE
-++
-++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-++# For P frames we make the second x,y coordinates offset by +8
-++
-++################################################################################
-++# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-++::mc_setup
-++
-++# Read starting kernel
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# Compute base address for first and second access
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++add ra_y, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-++max r1, r1, 0
-++min r1, r1, rb_frame_height_minus_1
-++nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++add t0s, r2, r1 ; mov ra_frame_base, r2
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-++max r1, r1, 0
-++min r1, r1, rb_frame_height_minus_1
-++nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++add t0s, r2, r1 ; mov ra_frame_base2, r2
-++
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra22, 256
-++mov ra30, 64
-++
-++mov rb20, 0xffffff00
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num   # qpu_num = abcd
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++add rb28, r0, r1
-++
-++mov rb12,unif # offset before shift
-++mov rb13,unif # shift
-++
-++# Dump padding words
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_frame_base
-++
-++max r1, ra_y2, 0
-++min r1, r1, rb_frame_height_minus_1
-++bra -, ra31
-++add ra_y2, ra_y2, 1           # Delay 1
-++nop ; mul24 r1, r1, rb_pitch  # Delay 2
-++add t0s, r1, ra_frame_base2   # Delay 3
-++
-++
-++################################################################################
-++
-++# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# In a P block, only the first half of coefficients contain used information.
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-++::mc_filter
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov rx_xshift2, rx_xshift2_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++mov ra_y_next, r1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0   ; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2_next, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-++
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients and discard unused B frame values
-++mov r0, unif
-++mov.ifnz -, unif # Alternate coefficients are unused for P frames
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++asr rb4, r0, rb23
-++
-++mov r0, unif # Frame0 offset/weight
-++mov.ifnz -, unif # Frame1 offset/weight unused
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++
-++# r3 is loop counter
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:yloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# If we knew there was no clipping then this code would get simpler.
-++# Perhaps we could add on the pitch and clip using larger values?
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, rx_xshift2
-++mov.ifz ra_y2, ra_y2_next
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-++
-++max r2, ra_y2, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# apply horizontal filter
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 8    ; mov ra12, ra13
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++brr.anyn -, r:yloop
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-++add r1, r1, r0          ; mul24 r0, ra8, rb4
-++add r1, r1, r0          ; mul24 r0, ra9, rb5
-++add r1, r1, r0          ; mul24 r0, ra10, rb6
-++add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++nop                     ; mul24 r1, r1, rb14
-++add r1, r1, rb12
-++asr r1, r1, rb13
-++brr.anyn -, r:yloop
-++add r1, r1, rb15       # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW setup 0    Delay 1
-++mov vw_setup, rb29 # Stride         Delay 2
-++mov vw_addr, unif # start the VDW   Delay 3
-++
-++
-++
-++################################################################################
-++
-++# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# In a P block, only the first half of coefficients contain used information.
-++# At this point we have already issued two pairs of texture requests for the current block
-++# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-++# Can fill in the coefficients so only
-++# Can also assume default weighted prediction for B frames.
-++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
-++# Or possibly by taking advantage of symmetry?
-++# From 19->7 32bits per command.
-++::mc_filter_b
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov rx_xshift2, rx_xshift2_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++mov ra_y_next, r1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0   ; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2_next, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-++
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients and discard unused B frame values
-++mov r0, unif
-++mov r1, 1
-++mov.ifnz r0, unif # Alternate coefficients are unused for P frames
-++nop              ;      mul24 r0, r0 << 13, r1 << 13
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 14, r1 << 14
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++nop              ;      mul24 r0, r0 << 9, r1 << 9
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 10, r1 << 10
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 11, r1 << 11
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 12, r1 << 12
-++asr ra4, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++asr rb4, r0, rb23
-++
-++mov r0, unif # Frame0 offset/weight
-++mov.ifnz r0, unif # Frame1 offset/weight unused
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++
-++# r3 is loop counter
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:yloopb
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# If we knew there was no clipping then this code would get simpler.
-++# Perhaps we could add on the pitch and clip using larger values?
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, rx_xshift2
-++mov.ifz ra_y2, ra_y2_next
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-++
-++max r2, ra_y2, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# apply horizontal filter
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 8    ; mov ra12, ra13
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++brr.anyn -, r:yloopb
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-++add r1, r1, r0          ; mul24 r0, ra8, rb4
-++add r1, r1, r0          ; mul24 r0, ra9, rb5
-++add r1, r1, r0          ; mul24 r0, ra10, rb6
-++add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
-++add r1, r1, ra30        ; mul24 r0, r1, rb14
-++add r1, r1, r0
-++brr.anyn -, r:yloopb
-++asr r1, r1, 7          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out
-++bra -, ra31
-++mov vw_setup, rb26 # VDW setup 0    Delay 1
-++mov vw_setup, rb29 # Stride         Delay 2
-++mov vw_addr, unif # start the VDW   Delay 3
-++
-++################################################################################
-++
-++# mc_interrupt_exit12()
-++::mc_interrupt_exit12
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++mov -,sacq(0) # 8
-++mov -,sacq(0) # 9
-++mov -,sacq(0) # 10
-++mov -,sacq(0) # 11
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++
-+ ::mc_end
-+ # Do not add code here because mc_end must appear after all other code.
-+-- 
-+2.5.0
-+
-+
-+From 3a5492970d13bf5ffe94898d59b3e882e7c8a1f5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 19:58:30 +0100
-+Subject: [PATCH 42/68] Added support for fast cache flush in deblocker
-+
-+---
-+ libavcodec/hevc_filter.c   |   44 +-
-+ libavcodec/rpi_qpu.c       |    6 +
-+ libavcodec/rpi_qpu.h       |    2 +
-+ libavcodec/rpi_shader.c    | 1028 +++++++++++++++++++++++++++++---------------
-+ libavcodec/rpi_shader.h    |   16 +-
-+ libavcodec/rpi_user_vcsm.h |   22 +
-+ 6 files changed, 768 insertions(+), 350 deletions(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 92a8271..186317a 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -37,6 +37,11 @@
-+ 
-+ #include "bit_depth_template.c"
-+ 
-++#ifdef RPI
-++#include "rpi_user_vcsm.h"
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #define LUMA 0
-+ #define CB 1
-+ #define CR 2
-+@@ -872,15 +877,46 @@ static void flush_buffer(AVBufferRef *bref) {
-+     gpu_cache_flush(p);
-+ }
-+ 
-+-static void ff_hevc_flush_chroma(HEVCContext *s)
-++// Return Physical address for this image
-++static int ff_hevc_buf_base(AVBufferRef *bref) {
-++  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++  return p->vc & 0x3fffffff;
-++}
-++
-++static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-++#define RPI_FAST_CACHEFLUSH
-++#ifdef RPI_FAST_CACHEFLUSH
-++        struct vcsm_user_clean_invalid_s iocache = {};
-++        int curr_y = f->progress->data[0];
-++        int sz,base;
-++        if (curr_y < 0) curr_y = 0;
-++        if (n<=curr_y) return; // Should not happen
-++        sz = s->frame->linesize[1] * (n-curr_y);
-++        base = s->frame->linesize[1] * curr_y;
-++        iocache.s[0].cmd = 3; // Flush L1 cache
-++        iocache.s[0].addr = 0;
-++        iocache.s[0].size  = 0;
-++
-++        iocache.s[1].cmd = 2;
-++        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-++        iocache.s[1].size  = sz;
-++
-++        iocache.s[2].cmd = 2;
-++        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
-++        iocache.s[2].size  = sz;
-++
-++        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
-++
-++#else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-++#endif
-+         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+         //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+@@ -903,7 +939,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x, y - ctb_size);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s);
-++                ff_hevc_flush_chroma(s,&s->ref->tf, y);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-+             }
-+@@ -912,7 +948,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x , y);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s);
-++                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+             }
-+@@ -922,7 +958,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+-        ff_hevc_flush_chroma(s);
-++        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index f62051f..fd8a276 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -237,6 +237,12 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+   return r;
-+ }
-+ 
-++int gpu_get_mailbox(void)
-++{
-++  assert(gpu);
-++  return gpu->mb;
-++}
-++
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+   void *tmp = vcsm_lock(p->vcsm_handle);
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 543c84b..88965e5 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -49,4 +49,6 @@ extern int rpi_test_shader(void);
-+ extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
-+ extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
-+ 
-++extern int gpu_get_mailbox(void);
-++
-+ #endif
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 3f04d80..9c30e32 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -23,11 +23,11 @@ __attribute__((aligned(8)))
-+ unsigned int rpi_shader[] = {
-+ // ::mc_setup_uv
-+ /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+ /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+ /* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+ /* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+ /* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+ /* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+@@ -35,360 +35,708 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+ /* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+ /* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000070] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000078] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_setup
-++/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-++/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-++// ::mc_filter
-++/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :yloop
-++/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_b
-++/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
-++/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-++/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-++/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-++/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-++/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-++/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-++/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-++/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :yloopb
-++/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-++/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-++/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_interrupt_exit12
-++/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index cec9901..3fa8531 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,15 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 342)
-+-#define mc_filter_uv_b (rpi_shader + 494)
-+-#define mc_exit (rpi_shader + 670)
-+-#define mc_interrupt_exit8 (rpi_shader + 688)
-+-#define mc_end (rpi_shader + 718)
-++#define mc_filter_uv (rpi_shader + 148)
-++#define mc_filter_uv_b0 (rpi_shader + 338)
-++#define mc_filter_uv_b (rpi_shader + 490)
-++#define mc_exit (rpi_shader + 666)
-++#define mc_interrupt_exit8 (rpi_shader + 684)
-++#define mc_setup (rpi_shader + 714)
-++#define mc_filter (rpi_shader + 868)
-++#define mc_filter_b (rpi_shader + 1108)
-++#define mc_interrupt_exit12 (rpi_shader + 1364)
-++#define mc_end (rpi_shader + 1402)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+index fbebbbe..95e6de1 100644
-+--- a/libavcodec/rpi_user_vcsm.h
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -418,6 +418,28 @@ int vcsm_unlock_hdl( unsigned int handle );
-+ */
-+ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+ 
-++/* Clean and/or invalidate the memory associated with this user opaque handle
-++**
-++** Returns:        non-zero on error
-++**
-++** structure contains a list of flush/invalidate commands. Commands are:
-++** 0: nop
-++** 1: invalidate given physical range in L2
-++** 2: clean      given physical range in L2
-++** 3: clean+invalidate all of L1
-++** 4: flush      all of L2 and all of L1
-++*/
-++struct vcsm_user_clean_invalid_s {
-++    struct {
-++       unsigned int cmd;
-++       unsigned int addr;
-++       unsigned int size;
-++    } s[8];
-++};
-++
-++int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
-++
-++
-+ #ifdef __cplusplus
-+ }
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 855a757b894c53ddeadeb94c74bd11d3b2fa1fd3 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 21:12:55 +0100
-+Subject: [PATCH 43/68] Added multi mailbox - not working
-+
-+---
-+ libavcodec/hevc.c        | 40 ++++++++++++++++++++++++++++---
-+ libavcodec/rpi_mailbox.c | 47 +++++++++++++++++++++++++++++++++++++
-+ libavcodec/rpi_mailbox.h |  5 ++++
-+ libavcodec/rpi_qpu.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++----
-+ libavcodec/rpi_qpu.h     |  2 ++
-+ 5 files changed, 147 insertions(+), 8 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 1cc7900..9bf0d28 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -45,6 +45,11 @@
-+   //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-++
-++  #ifdef RPI_INTER_QPU
-++    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-++    #define RPI_MULTI_MAILBOX
-++  #endif
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -2830,10 +2835,14 @@ static void rpi_inter_clear(HEVCContext *s)
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-++    int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+-
-+-    if (s->sh.slice_type == I_SLICE)
-+-        return;
-++    if (s->sh.slice_type == I_SLICE) {
-++#ifdef RPI_MULTI_MAILBOX
-++      rpi_execute_transform(s);
-++      return;
-++#endif
-++    }
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+@@ -2843,6 +2852,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-++#ifdef RPI_MULTI_MAILBOX
-++    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-++                                   qpu_get_fn(QPU_MC_SETUP_UV),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++                                 );
-++    for(i=0;i<4;i++)
-++        s->num_coeffs[i] = 0;
-++#else
-+     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+       (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+       (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+@@ -2853,6 +2878,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+       (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+       );
-++#endif
-+ }
-+ #endif
-+ 
-+@@ -2932,6 +2958,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+             // Transform all blocks
-+             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++#ifdef RPI_MULTI_MAILBOX
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++            // Perform luma inter prediction
-++            rpi_execute_inter_cmds(s);
-++#else
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+@@ -2939,6 +2971,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             // Kick off inter prediction on QPUs
-+             rpi_execute_inter_qpu(s);
-+ #endif
-++#endif
-++
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-+ 
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+index 77a56dd..3904efc 100644
-+--- a/libavcodec/rpi_mailbox.c
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -276,6 +276,53 @@ unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigne
-+    return p[5];
-+ }
-+ 
-++void execute_multi(int file_desc,
-++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
-++   int i=0;
-++   unsigned p[32];
-++
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++   p[i++] = 0x30018; // (the tag id)
-++   p[i++] = 88; // (size of the buffer)
-++   p[i++] = 88; // (size of the data)
-++
-++   p[i++] = num_qpus;
-++   p[i++] = control;
-++   p[i++] = noflush;
-++   p[i++] = timeout; // ms
-++
-++   p[i++] = num_qpus_2;
-++   p[i++] = control_2;
-++   p[i++] = noflush_2;
-++   p[i++] = timeout_2; // ms
-++
-++   p[i++] = code;
-++   p[i++] = r0;
-++   p[i++] = r1;
-++   p[i++] = r2;
-++   p[i++] = r3;
-++   p[i++] = r4;
-++   p[i++] = r5;
-++
-++   p[i++] = code_2;
-++   p[i++] = r0_2;
-++   p[i++] = r1_2;
-++   p[i++] = r2_2;
-++   p[i++] = r3_2;
-++   p[i++] = r4_2;
-++   p[i++] = r5_2;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return;
-++}
-++
-+ int mbox_open() {
-+    int file_desc;
-+ 
-+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-+index c264d2e..5898102 100644
-+--- a/libavcodec/rpi_mailbox.h
-++++ b/libavcodec/rpi_mailbox.h
-+@@ -15,6 +15,11 @@ extern void unmapmem(void *addr, unsigned size);
-+ 
-+ extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-++extern void execute_multi(int file_desc,
-++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
-+ extern unsigned qpu_enable(int file_desc, unsigned enable);
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index fd8a276..feb3284 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -123,7 +123,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-+ static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+-static int vpu_cmds[MAXCMDS][8];
-++static int vpu_cmds[MAXCMDS][16];
-+ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+@@ -346,6 +346,7 @@ unsigned int vpu_get_constants(void) {
-+ static void *vpu_start(void *arg) {
-+   while(1) {
-+     int *p;
-++    int qpu_code;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -358,12 +359,25 @@ static void *vpu_start(void *arg) {
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-+-    if (p[7]) {
-++    qpu_code = p[7];
-++    //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+-    }
-+-    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++    //}
-++    if (!qpu_code) {
-++      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++    } else {
-++      int i;
-++      for(i=0;i<8;i++) {
-++        gpu->mail[i*2] = p[8+i];
-++        gpu->mail[i*2 + 1] = qpu_code;
-++      }
-+ 
-++      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              0, 0, 0, 0,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++    }
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+@@ -400,7 +414,43 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+     p[4] = r3;
-+     p[5] = r4;
-+     p[6] = r5;
-+-    p[7] = (int) buf;
-++    p[7] = 0;
-++    if (num<=1)
-++      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-++    pthread_mutex_unlock(&post_mutex);
-++    return id;
-++  }
-++}
-++
-++int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++{
-++
-++  pthread_mutex_lock(&post_mutex);
-++  {
-++    int id = vpu_async_tail++;
-++    int *p = vpu_cmds[id%MAXCMDS];
-++    int num = vpu_async_tail - vpu_async_head;
-++    if (num>MAXCMDS) {
-++      printf("Too many commands submitted\n");
-++      exit(-1);
-++    }
-++    p[0] = vpu_code;
-++    p[1] = r0;
-++    p[2] = r1;
-++    p[3] = r2;
-++    p[4] = r3;
-++    p[5] = r4;
-++    p[6] = r5;
-++    p[7] = qpu_code;
-++    p[8 ] = unifs1;
-++    p[9 ] = unifs2;
-++    p[10] = unifs3;
-++    p[11] = unifs4;
-++    p[12] = unifs5;
-++    p[13] = unifs6;
-++    p[14] = unifs7;
-++    p[15] = unifs8;
-+     if (num<=1)
-+       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -966,6 +1016,7 @@ void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, i
-+ }
-+ 
-+ 
-++
-+ #endif
-+ 
-+ #endif // RPI
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 88965e5..2f08f03 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -41,6 +41,8 @@ extern unsigned int vpu_get_fn(void);
-+ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-++int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+-- 
-+2.5.0
-+
-+
-+From e576989224bf22d2b945e9ded8b27bafe1bd5417 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 21 May 2015 16:50:02 +0100
-+Subject: [PATCH 44/68] Pass qpu number in as uniform
-+
-+---
-+ libavcodec/hevc.c          |    2 +-
-+ libavcodec/rpi_shader.c    | 1288 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   20 +-
-+ libavcodec/rpi_shader.qasm |   10 +-
-+ 4 files changed, 657 insertions(+), 663 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 9bf0d28..25e1cbd 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2821,6 +2821,7 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-++        *s->u_mvs[i]++ = i;
-+         if (weight_flag) {
-+             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+@@ -2828,7 +2829,6 @@ static void rpi_inter_clear(HEVCContext *s)
-+             *s->u_mvs[i]++ = 1 << 5;
-+             *s->u_mvs[i]++ = 6;
-+         }
-+-        s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 9c30e32..a0f0282 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -48,8 +48,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+ /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+ /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
-++/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+ /* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+ /* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+ /* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+@@ -60,669 +60,669 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+ /* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+ /* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+-/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+-/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+-/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+-/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+-/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-++/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup
-+-/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+-/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+-/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+-/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+-/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+-/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-++/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-++/* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+ // ::mc_filter
-+-/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloop
-+-/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
-+-/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+-/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
-++/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-++/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-++/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-++/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-++/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-++/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-++/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-++/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloopb
-+-/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+-/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-++/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-++/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_interrupt_exit12
-+-/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+@@ -732,11 +732,9 @@ unsigned int rpi_shader[] = {
-+ /* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3fa8531..6e552d9 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,15 +4,15 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 148)
-+-#define mc_filter_uv_b0 (rpi_shader + 338)
-+-#define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 666)
-+-#define mc_interrupt_exit8 (rpi_shader + 684)
-+-#define mc_setup (rpi_shader + 714)
-+-#define mc_filter (rpi_shader + 868)
-+-#define mc_filter_b (rpi_shader + 1108)
-+-#define mc_interrupt_exit12 (rpi_shader + 1364)
-+-#define mc_end (rpi_shader + 1402)
-++#define mc_filter_uv (rpi_shader + 144)
-++#define mc_filter_uv_b0 (rpi_shader + 334)
-++#define mc_filter_uv_b (rpi_shader + 486)
-++#define mc_exit (rpi_shader + 662)
-++#define mc_interrupt_exit8 (rpi_shader + 680)
-++#define mc_setup (rpi_shader + 710)
-++#define mc_filter (rpi_shader + 864)
-++#define mc_filter_b (rpi_shader + 1104)
-++#define mc_interrupt_exit12 (rpi_shader + 1360)
-++#define mc_end (rpi_shader + 1398)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 9cfc0d9..a0b8e5a 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -133,8 +133,8 @@ mov ra14, 0
-+ mov ra15, 0
-+ 
-+ # Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-++mov r3, unif
-++shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+ and r2, r2, 15
-+ mov r1, r2
-+ asr r1, r1, 2
-+@@ -147,8 +147,7 @@ shl r0, r0, 5
-+ add rb27, r0, r1
-+ 
-+ # Compute part of VPM to save data into
-+-mov r2, qpu_num   # qpu_num = abcd
-+-shl r2, r2, 1
-++shl r2, r3, 1
-+ and r2, r2, 15    # r2 = bcd0
-+ mov r1, r2        # r1 = bcd0
-+ asr r1, r1, 2     # r1 = bc
-+@@ -181,9 +180,6 @@ add t0s, r2, r1
-+ mov rb12,unif # offset before shift
-+ mov rb13,unif # offset after shift
-+ 
-+-# Dump padding words
-+-mov r0, unif
-+-
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+ min r1, r1, rb_frame_height_minus_1
-+-- 
-+2.5.0
-+
-+
-+From 2372b3e0797cfce130103357085d21baecb0d5a8 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Sat, 23 May 2015 13:20:21 +0100
-+Subject: [PATCH 45/68] Add new cache flushing routine
-+
-+---
-+ libavcodec/hevc.c          |  8 +++--
-+ libavcodec/hevc_filter.c   | 39 ++++++++++-----------
-+ libavcodec/rpi_qpu.c       | 17 +++++++--
-+ libavcodec/rpi_qpu.h       |  2 ++
-+ libavcodec/rpi_user_vcsm.h | 86 ++++++++++++++++++++++++++--------------------
-+ 5 files changed, 91 insertions(+), 61 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 25e1cbd..31bbf67 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3498,9 +3498,13 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-+     }
-+ 
-+ fail:
-+-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
-++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-++#ifdef RPI_INTER_QPU
-++        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-++        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-++#endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-+-
-++    }
-+     return ret;
-+ }
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 186317a..ec84e8a 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -883,36 +883,35 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+   return p->vc & 0x3fffffff;
-+ }
-+ 
-+-static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-++void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-++void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-#define RPI_FAST_CACHEFLUSH
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+-        int curr_y = f->progress->data[0];
-++        int curr_y = ((int *)f->progress->data)[0];
-++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-++        int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-+-        if (curr_y < 0) curr_y = 0;
-+-        if (n<=curr_y) return; // Should not happen
-+-        sz = s->frame->linesize[1] * (n-curr_y);
-+-        base = s->frame->linesize[1] * curr_y;
-+-        iocache.s[0].cmd = 3; // Flush L1 cache
-+-        iocache.s[0].addr = 0;
-+-        iocache.s[0].size  = 0;
-+-
-+-        iocache.s[1].cmd = 2;
-+-        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-++        if (curr_uv < 0) curr_uv = 0;
-++        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++        base = s->frame->linesize[1] * curr_uv;
-++        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-++        iocache.s[0].handle = p->vcsm_handle;
-++        iocache.s[0].cmd = 3; // clean+invalidate
-++        iocache.s[0].addr = p->arm + base;
-++        iocache.s[0].size  = sz;
-++        p = av_buffer_pool_opaque(s->frame->buf[2]);
-++        iocache.s[1].handle = p->vcsm_handle;
-++        iocache.s[1].cmd = 3; // clean+invalidate
-++        iocache.s[1].addr = p->arm + base;
-+         iocache.s[1].size  = sz;
-+-
-+-        iocache.s[2].cmd = 2;
-+-        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
-+-        iocache.s[2].size  = sz;
-+-
-+-        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
-+-
-++        vcsm_clean_invalid( &iocache );
-+ #else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index feb3284..aa65a77 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -211,6 +211,7 @@ static void gpu_unlock(void) {
-+ }
-+ 
-+ static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-++  p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+   assert(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+@@ -243,13 +244,25 @@ int gpu_get_mailbox(void)
-+   return gpu->mb;
-+ }
-+ 
-++// Call this to clean and invalidate a region of memory
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-  void *tmp = vcsm_lock(p->vcsm_handle);
-+-  vcsm_unlock_ptr(tmp);
-++#define RPI_FAST_CACHEFLUSH
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm;
-++    iocache.s[0].size  = p->numbytes;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    void *tmp = vcsm_lock(p->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++#endif
-+ }
-+ 
-+ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-++  p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 2f08f03..0565a60 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -1,6 +1,8 @@
-+ #ifndef RPI_QPU_H
-+ #define RPI_QPU_H
-+ 
-++#define RPI_FAST_CACHEFLUSH
-++
-+ typedef struct gpu_mem_ptr_s {
-+   unsigned char *arm; // Pointer to memory mapped on ARM side
-+   int vc_handle;   // Videocore handle of relocatable memory
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+index 95e6de1..db41a4d 100644
-+--- a/libavcodec/rpi_user_vcsm.h
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -1,29 +1,41 @@
-+-/*
-+-Copyright (c) 2012, Broadcom Europe Ltd
-+-All rights reserved.
-+-
-+-Redistribution and use in source and binary forms, with or without
-+-modification, are permitted provided that the following conditions are met:
-+-    * Redistributions of source code must retain the above copyright
-+-      notice, this list of conditions and the following disclaimer.
-+-    * Redistributions in binary form must reproduce the above copyright
-+-      notice, this list of conditions and the following disclaimer in the
-+-      documentation and/or other materials provided with the distribution.
-+-    * Neither the name of the copyright holder nor the
-+-      names of its contributors may be used to endorse or promote products
-+-      derived from this software without specific prior written permission.
-+-
-+-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+-*/
-++/*****************************************************************************
-++* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-++*
-++* This program is the proprietary software of Broadcom Corporation and/or
-++* its licensors, and may only be used, duplicated, modified or distributed
-++* pursuant to the terms and conditions of a separate, written license
-++* agreement executed between you and Broadcom (an "Authorized License").
-++* Except as set forth in an Authorized License, Broadcom grants no license
-++* (express or implied), right to use, or waiver of any kind with respect to
-++* the Software, and Broadcom expressly reserves all rights in and to the
-++* Software and all intellectual property rights therein.  IF YOU HAVE NO
-++* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-++* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-++* THE SOFTWARE.
-++*
-++* Except as expressly set forth in the Authorized License,
-++* 1. This program, including its structure, sequence and organization,
-++*    constitutes the valuable trade secrets of Broadcom, and you shall use
-++*    all reasonable efforts to protect the confidentiality thereof, and to
-++*    use this information only in connection with your use of Broadcom
-++*    integrated circuit products.
-++* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-++*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-++*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-++*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-++*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-++*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-++*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-++*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-++* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-++*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-++*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-++*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-++*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-++*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-++*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-++*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-++*****************************************************************************/
-+ 
-+ #ifndef __USER_VCSM__H__INCLUDED__
-+ #define __USER_VCSM__H__INCLUDED__
-+@@ -424,21 +436,21 @@ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+ **
-+ ** structure contains a list of flush/invalidate commands. Commands are:
-+ ** 0: nop
-+-** 1: invalidate given physical range in L2
-+-** 2: clean      given physical range in L2
-+-** 3: clean+invalidate all of L1
-+-** 4: flush      all of L2 and all of L1
-++** 1: invalidate       given virtual range in L1/L2
-++** 2: clean            given virtual range in L1/L2
-++** 3: clean+invalidate given virtual range in L1/L2
-++** 4: flush all L1/L2
-+ */
-+ struct vcsm_user_clean_invalid_s {
-+-    struct {
-+-       unsigned int cmd;
-+-       unsigned int addr;
-+-       unsigned int size;
-+-    } s[8];
-++   struct {
-++      unsigned int cmd;
-++      unsigned int handle;
-++      unsigned int addr;
-++      unsigned int size;
-++   } s[8];
-+ };
-+ 
-+-int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
-+-
-++int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+ 
-+ #ifdef __cplusplus
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 7ddf50b155ce8417e8b27735098b3651567f07e5 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Sat, 23 May 2015 21:10:10 +0100
-+Subject: [PATCH 46/68] Fix multi mailbox extra transform call
-+
-+---
-+ libavcodec/hevc.c | 2 ++
-+ 1 file changed, 2 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 31bbf67..f479707 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3011,7 +3011,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_INTER_QPU
-+         rpi_execute_inter_qpu(s);
-+ #endif
-++#ifndef RPI_MULTI_MAILBOX
-+         rpi_execute_transform(s);
-++#endif
-+         rpi_execute_inter_cmds(s);
-+         vpu_wait(s->vpu_id);
-+         rpi_execute_pred_cmds(s);
-+-- 
-+2.5.0
-+
-+
-+From 9d16a24e225841b0ba09006edcd052ac2ccaf335 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 27 May 2015 16:44:29 +0100
-+Subject: [PATCH 47/68] Added support for running luma prediction on QPUs
-+
-+---
-+ libavcodec/hevc.c          |  237 +++++++-
-+ libavcodec/hevc.h          |   26 +-
-+ libavcodec/hevc_filter.c   |   23 +-
-+ libavcodec/rpi_qpu.c       |  156 ++++--
-+ libavcodec/rpi_qpu.h       |    8 +-
-+ libavcodec/rpi_shader.c    | 1313 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   21 +-
-+ libavcodec/rpi_shader.qasm |  883 ++++++++++++++---------------
-+ 8 files changed, 1464 insertions(+), 1203 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index f479707..c6b619b 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -50,6 +50,11 @@
-+     // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-+     #define RPI_MULTI_MAILBOX
-+   #endif
-++
-++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-++  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-++
-++
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -72,6 +77,13 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-++// Split image of 2048 into parts 64 wide
-++// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-++// Each block of 64*64
-++// Smallest CTU size is 16x16, so smallest block is 8x8
-++// Corresponds to a total of 83kbytes over all 12 QPUs
-++#define RPI_LUMA_COMMAND_WORDS 9
-++#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -2002,10 +2014,46 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            int reflist = 0;
-++            const Mv *mv         = &current_mv.mv[reflist];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  if (weight_flag) {
-++                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-++                  } else {
-++                      *y++ = 1; // Weight of 1 and offset of 0
-++                  }
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++        {
-++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2065,10 +2113,47 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            int reflist = 1;
-++            const Mv *mv    = &current_mv.mv[reflist];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  if (weight_flag) {
-++                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-++                  } else {
-++                      *y++ = 1; // Weight of 1 and offset of 0
-++                  }
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++
-++        {
-++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-+                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2102,8 +2187,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       if (weight_flag) {
-+-                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
-+-                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][1] & 0xffff);
-+                       } else {
-+                           *u++ = 1; // Weight of 1 and offset of 0
-+                           *u++ = 1;
-+@@ -2130,9 +2215,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            const Mv *mv    = &current_mv.mv[0];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            const Mv *mv2    = &current_mv.mv[1];
-++            int mx2          = mv2->x & 3;
-++            int my2          = mv2->y & 3;
-++            int my2_mx2 = (my2<<8) + mx2;
-++            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int x2 = x0 + (mv2->x >> 2);
-++            int y2 = y0 + (mv2->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  *y++ = 1; // B frame weighted prediction not supported
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++        {
-++            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2821,7 +2941,6 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-+-        *s->u_mvs[i]++ = i;
-+         if (weight_flag) {
-+             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+@@ -2829,7 +2948,31 @@ static void rpi_inter_clear(HEVCContext *s)
-+             *s->u_mvs[i]++ = 1 << 5;
-+             *s->u_mvs[i]++ = 6;
-+         }
-++        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-++    }
-++
-++#ifdef RPI_LUMA_QPU
-++    for(i=0;i<12;i++) {
-++        s->y_mvs[i] = s->y_mvs_base[i];
-++        *s->y_mvs[i]++ = 0; // y_x
-++        *s->y_mvs[i]++ = 0; // ref_y_base
-++        *s->y_mvs[i]++ = 0; // y2_x2
-++        *s->y_mvs[i]++ = 0; // ref_y2_base
-++        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-++        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
-++        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-++        if (weight_flag) {
-++            int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-++            int shift = s->sh.luma_log2_weight_denom + 6;
-++            *s->y_mvs[i]++ = (offset << 16) + shift;
-++        } else {
-++            int offset = 1 << 5;
-++            int shift = 6;
-++            *s->y_mvs[i]++ = (offset << 16) + shift;
-++        }
-++        *s->y_mvs[i]++ = 0; // Next kernel
-+     }
-++#endif
-+ }
-+ 
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+@@ -2837,6 +2980,9 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+     int k;
-+     int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++#ifdef RPI_LUMA_QPU
-++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
-++#endif
-+     if (s->sh.slice_type == I_SLICE) {
-+ #ifdef RPI_MULTI_MAILBOX
-+       rpi_execute_transform(s);
-+@@ -2852,8 +2998,23 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-++#ifdef RPI_LUMA_QPU
-++    for(k=0;k<12;k++) {
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++    }
-++    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++#endif
-++
-++
-+ #ifdef RPI_MULTI_MAILBOX
-++#ifdef RPI_CACHE_UNIF_MVS
-++    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-++#else
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-++#endif
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+@@ -2863,7 +3024,27 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+                                    (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++#ifdef RPI_LUMA_QPU
-++                                   qpu_get_fn(QPU_MC_SETUP),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
-++#else
-++                                   0,
-++                                   0,0,0,0,
-++                                   0,0,0,0,
-++                                   0,0,0,0
-++#endif
-+                                  );
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2879,6 +3060,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+       );
-+ #endif
-++
-++
-+ }
-+ #endif
-+ 
-+@@ -3502,8 +3685,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-+ fail:
-+     if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-+ #ifdef RPI_INTER_QPU
-+-        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+-        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-++        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-+     }
-+@@ -3690,7 +3872,6 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ 
-+ #ifdef RPI
-+     av_freep(&s->unif_mv_cmds);
-+-    av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+ 
-+ #ifdef RPI_INTER_QPU
-+@@ -3699,7 +3880,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+         s->unif_mvs = 0;
-+     }
-+ #endif
-+-    //gpu_free(&s->dummy);
-++#ifdef RPI_LUMA_QPU
-++    if (s->y_unif_mvs) {
-++        gpu_free( &s->y_unif_mvs_ptr );
-++        s->y_unif_mvs = 0;
-++    }
-++#endif
-+ 
-+ #ifdef EARLY_MALLOC
-+     printf("hevc_decode_free\n");
-+@@ -3789,9 +3975,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+     if (!s->unif_mv_cmds)
-+         goto fail;
-+-    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
-+-    if (!s->unif_xfm_cmds)
-+-        goto fail;
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+@@ -3805,7 +3988,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     {
-+         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-++#ifdef RPI_CACHE_UNIF_MVS
-++        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++#else
-+         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++#endif
-+         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+ 
-+         // Set up initial locations for uniform streams
-+@@ -3820,6 +4007,28 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ 
-+     }
-+ #endif
-++#ifdef RPI_LUMA_QPU
-++    {
-++        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-++        uint32_t *p;
-++#ifdef RPI_CACHE_UNIF_MVS
-++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++#else
-++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++#endif
-++        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++
-++        // Set up initial locations for uniform streams
-++        p = s->y_unif_mvs;
-++        for(i = 0; i < 12; i++) {
-++            s->y_mvs_base[i] = p;
-++            p += y_commands_per_qpu;
-++        }
-++        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-++        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-++
-++    }
-++#endif
-+     //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+ #ifdef EARLY_MALLOC
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 3511982..33dedf7 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -43,9 +43,13 @@
-+ #ifdef RPI
-+ 
-+   #include "rpi_qpu.h"
-+-  // Use QPU for inter prediction
-++  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
-+   #define RPI_INTER_QPU
-+ 
-++  #ifdef RPI_INTER_QPU
-++    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-++    #define RPI_LUMA_QPU
-++  #endif
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -835,7 +839,6 @@ typedef struct HEVCLocalContext {
-+ 
-+ // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+ #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+-#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+ // Worst case is 16x16 CTUs
-+@@ -870,9 +873,6 @@ typedef struct HEVCMvCmd {
-+     int8_t ref_idx[2];
-+ } HEVCMvCmd;
-+ 
-+-// Command for transform to process a block of coefficients
-+-typedef struct HEVCXfmCmd {
-+-} HEVCXfmCmd;
-+ 
-+ // Command for intra prediction and transform_add of predictions to coefficients
-+ #define RPI_PRED_TRANSFORM_ADD 0
-+@@ -918,8 +918,7 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;  // TODO rename
-+-    HEVCXfmCmd *unif_xfm_cmds;
-++    HEVCMvCmd *unif_mv_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+     int buf_width;
-+     GPU_MEM_PTR_T coeffs_buf_default;
-+@@ -946,6 +945,15 @@ typedef struct HEVCContext {
-+     uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-++#ifdef RPI_LUMA_QPU
-++    GPU_MEM_PTR_T y_unif_mvs_ptr;
-++    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
-++    uint32_t *y_mvs_base[12];
-++    uint32_t *y_mvs[12];
-++    // Function pointers
-++    uint32_t mc_filter;
-++    uint32_t mc_filter_b;
-++#endif
-+ 
-+ #endif
-+ 
-+@@ -1181,6 +1189,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                                  int log2_trafo_size, enum ScanType scan_idx,
-+                                  int c_idx);
-+ 
-++#ifdef RPI_INTER_QPU
-++extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
-++#endif
-++
-+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
-+ 
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index ec84e8a..11629e4 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -883,8 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+   return p->vc & 0x3fffffff;
-+ }
-+ 
-+-void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+-void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-++void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+@@ -911,10 +910,24 @@ void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+         iocache.s[1].cmd = 3; // clean+invalidate
-+         iocache.s[1].addr = p->arm + base;
-+         iocache.s[1].size  = sz;
-++
-++#ifdef RPI_LUMA_QPU
-++        p = av_buffer_pool_opaque(s->frame->buf[0]);
-++        sz = s->frame->linesize[0] * (n-curr_y);
-++        base = s->frame->linesize[0] * curr_y;
-++        iocache.s[2].handle = p->vcsm_handle;
-++        iocache.s[2].cmd = 3; // clean+invalidate
-++        iocache.s[2].addr = p->arm + base;
-++        iocache.s[2].size  = sz;
-++#endif
-+         vcsm_clean_invalid( &iocache );
-+ #else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-++#ifdef RPI_LUMA_QPU
-++        flush_buffer(s->frame->buf[1]);
-++#endif
-++
-+ #endif
-+         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+@@ -938,7 +951,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x, y - ctb_size);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s,&s->ref->tf, y);
-++                ff_hevc_flush_buffer(s,&s->ref->tf, y);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-+             }
-+@@ -947,7 +960,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x , y);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
-++                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+             }
-+@@ -957,7 +970,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+-        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
-++        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index aa65a77..e12304b 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,9 +1,11 @@
-+ #ifdef RPI
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+-#define RPI_TIME_TOTAL_QPU
-++//#define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-++// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-++//#define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ 
-+@@ -94,7 +96,8 @@ struct GPU
-+   int open_count; // Number of allocated video buffers
-+   int      mb; // Mailbox handle
-+   int      vc; // Address in GPU memory
-+-  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-++  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-++  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
-+ };
-+ 
-+ // Stop more than one thread trying to allocate memory or use the processing resources at once
-+@@ -102,7 +105,7 @@ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-+ static GPU_MEM_PTR_T gpu_mem_ptr;
-+ 
-+-#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
-+ static unsigned int Microseconds(void) {
-+     struct timespec ts;
-+     unsigned int x;
-+@@ -123,7 +126,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-+ static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+-static int vpu_cmds[MAXCMDS][16];
-++static int vpu_cmds[MAXCMDS][32];
-+ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+@@ -247,7 +250,6 @@ int gpu_get_mailbox(void)
-+ // Call this to clean and invalidate a region of memory
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-#define RPI_FAST_CACHEFLUSH
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     iocache.s[0].handle = p->vcsm_handle;
-+@@ -261,6 +263,34 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ #endif
-+ }
-+ 
-++void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    iocache.s[0].handle = p0->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = (int) p0->arm;
-++    iocache.s[0].size  = p0->numbytes;
-++    iocache.s[1].handle = p1->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = (int) p1->arm;
-++    iocache.s[1].size  = p1->numbytes;
-++    iocache.s[2].handle = p2->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = (int) p2->arm;
-++    iocache.s[2].size  = p2->numbytes;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    void *tmp;
-++    tmp = vcsm_lock(p0->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++    tmp = vcsm_lock(p1->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++    tmp = vcsm_lock(p2->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++#endif
-++}
-++
-+ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+   p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+@@ -357,9 +387,19 @@ unsigned int vpu_get_constants(void) {
-+ #ifdef RPI_ASYNC
-+ 
-+ static void *vpu_start(void *arg) {
-++#ifdef RPI_TIME_TOTAL_POSTED
-++  int last_time=0;
-++  long long on_time=0;
-++  long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  int count=0;
-++#endif
-+   while(1) {
-++    int i;
-+     int *p;
-+     int qpu_code;
-++    int qpu_codeb;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -373,24 +413,49 @@ static void *vpu_start(void *arg) {
-+       break; // Last job
-+     }
-+     qpu_code = p[7];
-++    qpu_codeb = p[16];
-+     //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+     //}
-++
-++#ifdef RPI_TIME_TOTAL_POSTED
-++    start_time = Microseconds();
-++    if (last_time==0)
-++      last_time = start_time;
-++    off_time += start_time-last_time;
-++#endif
-++
-+     if (!qpu_code) {
-+       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+     } else {
-+-      int i;
-+       for(i=0;i<8;i++) {
-+         gpu->mail[i*2] = p[8+i];
-+         gpu->mail[i*2 + 1] = qpu_code;
-+       }
-+-
-+-      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+-                              0, 0, 0, 0,
-++      for(i=0;i<12;i++) {
-++        gpu->mail2[i*2] = p[17+i];
-++        gpu->mail2[i*2 + 1] = qpu_codeb;
-++      }
-++#if (0)
-++      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++      execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
-++#else
-++      execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++#endif
-+     }
-++#ifdef RPI_TIME_TOTAL_POSTED
-++    end_time = Microseconds();
-++    last_time = end_time;
-++    on_time += end_time - start_time;
-++    count++;
-++    if ((count&0x7f)==0)
-++      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+@@ -436,7 +501,9 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+ }
-+ 
-+ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+-                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-++                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-++                      )
-+ {
-+ 
-+   pthread_mutex_lock(&post_mutex);
-+@@ -464,6 +531,21 @@ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2,
-+     p[13] = unifs6;
-+     p[14] = unifs7;
-+     p[15] = unifs8;
-++
-++    p[16] = qpu_codeb;
-++    p[17] = unifs1b;
-++    p[18] = unifs2b;
-++    p[19] = unifs3b;
-++    p[20] = unifs4b;
-++    p[21] = unifs5b;
-++    p[22] = unifs6b;
-++    p[23] = unifs7b;
-++    p[24] = unifs8b;
-++    p[25] = unifs9b;
-++    p[26] = unifs10b;
-++    p[27] = unifs11b;
-++    p[28] = unifs12b;
-++
-+     if (num<=1)
-+       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -544,27 +626,27 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-+   off_time += start_time-last_time;
-+ #endif
-+   for(i=0;i<num;i++) {
-+-    gpu->mail[i*2 + 1] = code;
-++    gpu->mail2[i*2 + 1] = code;
-+   }
-+   for(;i<num+num2;i++) {
-+-    gpu->mail[i*2 + 1] = code2;
-++    gpu->mail2[i*2 + 1] = code2;
-+   }
-+-  gpu->mail[0 ] = unifs1;
-+-  gpu->mail[2 ] = unifs2;
-+-  gpu->mail[4 ] = unifs3;
-+-  gpu->mail[6 ] = unifs4;
-+-  gpu->mail[8 ] = unifs5;
-+-  gpu->mail[10] = unifs6;
-+-	gpu->mail[12] = unifs7;
-+-	gpu->mail[14] = unifs8;
-+-	gpu->mail[16] = unifs9;
-+-	gpu->mail[18] = unifs10;
-+-	gpu->mail[20] = unifs11;
-+-	gpu->mail[22] = unifs12;
-++  gpu->mail2[0 ] = unifs1;
-++  gpu->mail2[2 ] = unifs2;
-++  gpu->mail2[4 ] = unifs3;
-++  gpu->mail2[6 ] = unifs4;
-++  gpu->mail2[8 ] = unifs5;
-++  gpu->mail2[10] = unifs6;
-++	gpu->mail2[12] = unifs7;
-++	gpu->mail2[14] = unifs8;
-++	gpu->mail2[16] = unifs9;
-++	gpu->mail2[18] = unifs10;
-++	gpu->mail2[20] = unifs11;
-++	gpu->mail2[22] = unifs12;
-+ 	execute_qpu(
-+ 		gpu->mb,
-+ 		12 /* Number of QPUs */,
-+-		gpu->vc + offsetof(struct GPU, mail),
-++		gpu->vc + offsetof(struct GPU, mail2),
-+ 		1 /* no flush */,  // Don't flush VPU L1 cache
-+ 		5000 /* timeout ms */);
-+ #ifdef RPI_TIME_TOTAL_QPU
-+@@ -635,21 +717,21 @@ unsigned int qpu_get_fn(int num) {
-+       gpu_unlock();
-+     }
-+     switch(num) {
-+-    //case QPU_MC_SETUP:
-+-    //  fn = mc_setup;
-+-    //  break;
-+-    //case QPU_MC_FILTER:
-+-    //  fn = mc_filter;
-+-    //  break;
-++    case QPU_MC_SETUP:
-++      fn = mc_setup;
-++      break;
-++    case QPU_MC_FILTER:
-++      fn = mc_filter;
-++      break;
-+     case QPU_MC_EXIT:
-+       fn = mc_exit;
-+       break;
-+-    //case QPU_MC_INTERRUPT_EXIT:
-+-    //  fn = mc_interrupt_exit;
-+-    //  break;
-+-    //case QPU_MC_FILTER_B:
-+-    //  fn = mc_filter_b;
-+-    //  break;
-++    case QPU_MC_INTERRUPT_EXIT12:
-++      fn = mc_interrupt_exit12;
-++      break;
-++    case QPU_MC_FILTER_B:
-++      fn = mc_filter_b;
-++      break;
-+     //case QPU_MC_FILTER_HONLY:
-+     //  fn = mc_filter_honly;
-+     //  break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 0565a60..81c2bb1 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -1,6 +1,7 @@
-+ #ifndef RPI_QPU_H
-+ #define RPI_QPU_H
-+ 
-++// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
-+ #define RPI_FAST_CACHEFLUSH
-+ 
-+ typedef struct gpu_mem_ptr_s {
-+@@ -16,6 +17,7 @@ extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+ extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+ extern void gpu_free(GPU_MEM_PTR_T *p);
-+ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-++extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+ // QPU specific functions
-+ extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+@@ -26,7 +28,7 @@ enum {
-+   QPU_MC_SETUP,
-+   QPU_MC_FILTER,
-+   QPU_MC_EXIT,
-+-  QPU_MC_INTERRUPT_EXIT,
-++  QPU_MC_INTERRUPT_EXIT12,
-+   QPU_MC_FILTER_B,
-+   QPU_MC_FILTER_HONLY,
-+   QPU_MC_SETUP_UV,
-+@@ -44,7 +46,9 @@ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+-                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-++                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-++                      );
-+ extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index a0f0282..e86eb30 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -48,693 +48,674 @@ unsigned int rpi_shader[] = {
-+ /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+ /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+ /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
-+-/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+-/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+-/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+-/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+-/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000000d0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000000d8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000000e0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000000e8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000000f0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000000f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000100] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000108] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x00000110] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000118] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-++/* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
-++/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-++/* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
-++/* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
-++/* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000158] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000160] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000168] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000170] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000178] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000180] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000188] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000190] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000198] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000001a0] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-++/* [0x000001a8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001b0] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-++/* [0x000001b8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x000001c0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x000001c8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x000001d0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+ /* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+-/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+-/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x000001e0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x000001e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000001f0] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x000001f8] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000200] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000208] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000210] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000230] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000238] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000240] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000248] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000250] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000260] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000268] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000270] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000278] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000280] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000288] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000290] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000298] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002a0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002a8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002b0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002b8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002d0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002d8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002e0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000002e8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000002f0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000002f8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000300] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000308] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000310] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000318] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000320] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000328] */ 0x0f9e7080, 0x100208e7, // asr r3, r0, r2
-++/* [0x00000330] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000338] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000348] */ 0x0f9e7080, 0x100608e7, // asr.ifnz r3, r0, r2
-++/* [0x00000350] */ 0x119c87c0, 0xd00213a7, // shl rb14,r3,8
-++/* [0x00000358] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000360] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000368] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000370] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000378] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000380] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003a8] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000003e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000003e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000003f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000003f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000400] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000408] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000410] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000418] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000420] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000428] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000430] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000438] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000440] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000448] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000450] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000458] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000460] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00000468] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00000470] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00000478] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000480] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00000488] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000490] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000498] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004b0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004c0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004c8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004e0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004e8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000004f0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000004f8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000500] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000508] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000510] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000518] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000520] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000528] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000530] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000538] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000540] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000548] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000550] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000558] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000560] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000568] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000570] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000578] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000580] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000588] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000590] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000598] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d8] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000005e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000005f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000610] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000650] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x00000658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000660] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000668] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000670] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000680] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000690] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006a0] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000006a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000006b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006c8] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006d0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006d8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006e0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000006f8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000700] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000708] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000710] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000720] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000728] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000740] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000748] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000750] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000758] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000760] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000768] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000770] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000778] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000780] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000788] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000790] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000798] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007e0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007e8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007f0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000007f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000800] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000808] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000810] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000818] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000820] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000828] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000840] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000848] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000850] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000860] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000890] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000008d0] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a18] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a68] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup
-+-/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+-/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+-/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+-/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+-/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000ac0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-++/* [0x00000ac8] */ 0x15827d80, 0x10020227, // mov ra8, unif
-++/* [0x00000ad0] */ 0x15827d80, 0x10020267, // mov ra9, unif
-++/* [0x00000ad8] */ 0x15827d80, 0x100202a7, // mov ra10, unif
-++/* [0x00000ae0] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-++/* [0x00000ae8] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000af0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000af8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000b00] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000b08] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-++/* [0x00000b10] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-++/* [0x00000b18] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-++/* [0x00000b20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b28] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000b30] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000b38] */ 0x15227d80, 0x10020867, // mov r1, ra8
-++/* [0x00000b40] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000b48] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000b50] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000b58] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000b60] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000b68] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-++/* [0x00000b70] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b78] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b88] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b90] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b98] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000ba0] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000ba8] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000bb0] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-++/* [0x00000bb8] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000bc0] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000bc8] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000bd0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000bd8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000be0] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-++/* [0x00000be8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000bf0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000bf8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000c00] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000c08] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000c10] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000c18] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000c20] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000c28] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c30] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c38] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c40] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c48] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c50] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c58] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c60] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c68] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c70] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c78] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c80] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c88] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c90] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c98] */ 0x00004000, 0xe00204a7, // mov ra18, 0x4000
-++/* [0x00000ca0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000ca8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000cb0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000cb8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000cc0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cc8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cd8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000ce0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000ce8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cf0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cf8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000d00] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000d08] */ 0x919c82ff, 0xd0024822, // shl r0,r1,r3 ; mov r2,8
-++/* [0x00000d10] */ 0x0f9e70c0, 0x10021367, // asr rb13,r0,r3
-++/* [0x00000d18] */ 0x0f9e72c0, 0x10021327, // asr rb12,r1,r3
-++/* [0x00000d20] */ 0x0c9cde80, 0x10021367, // add rb13,rb13,r2
-++/* [0x00000d28] */ 0x119cce80, 0x10021327, // shl rb12, rb12, r2
-++/* [0x00000d30] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d38] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d40] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d48] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d50] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d58] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d60] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+ /* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+ /* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+-// ::mc_filter
-++/* [0x00000d78] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
-++// :per_block_setup
-+ /* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ /* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+ /* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+ /* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000da0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-++/* [0x00000da8] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000db0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000db8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000dc0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000dc8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000dd0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000dd8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000de0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000de8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000df0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000df8] */ 0x8c827436, 0x100246a1, // add ra_frame_base_next, r2, r0 ; mov r1, unif
-++/* [0x00000e00] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000e08] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000e10] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000e18] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000e20] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000e28] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000e30] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000e38] */ 0x159e7240, 0x10021067, // mov ra_y2_next, r1
-++/* [0x00000e40] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e48] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e58] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e60] */ 0x0e9e70c0, 0x10020867, // shr r1, r0, r3
-++/* [0x00000e68] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e70] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e78] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e80] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e88] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e90] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e98] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-++/* [0x00000ea0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ea8] */ 0x95801dbf, 0xd0024821, // mov r0, unif ; mov r1,1
-++/* [0x00000eb0] */ 0x4f5971c6, 0x10024260, // asr ra9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb8] */ 0x4f5971c6, 0x10024220, // asr ra8, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x4f5971c6, 0x10044260, // asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22
-++/* [0x00000ec8] */ 0x0f9d71c0, 0x10040227, // asr.ifz ra8, r0, rb23
-++/* [0x00000ed0] */ 0x0d243f80, 0xd0020267, // sub ra9,3,ra9
-++/* [0x00000ed8] */ 0x0d203f80, 0xd0020227, // sub ra8,3,ra8
-++/* [0x00000ee0] */ 0x11243dc0, 0xd0020267, // shl ra9,ra9,3
-++/* [0x00000ee8] */ 0x11203dc0, 0xd0020227, // shl ra8,ra8,3
-++/* [0x00000ef0] */ 0x00ffff00, 0xe0020867, // mov r1,0xffff00
-++/* [0x00000ef8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f00] */ 0x0f9d71c0, 0x10020027, // asr ra0, r0, rb23
-++/* [0x00000f08] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+ /* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000f18] */ 0x01040400, 0xe0020867, // mov r1,0x1040400
-++/* [0x00000f20] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f28] */ 0x0f9d71c0, 0x10020067, // asr ra1, r0, rb23
-++/* [0x00000f30] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f38] */ 0x0f9d71c0, 0x10021167, // asr rb5, r0, rb23
-++/* [0x00000f40] */ 0xfbf5f600, 0xe0020867, // mov r1,0xfbf5f600
-++/* [0x00000f48] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f50] */ 0x0f9d71c0, 0x100200a7, // asr ra2, r0, rb23
-++/* [0x00000f58] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f60] */ 0x0f9d71c0, 0x100211a7, // asr rb6, r0, rb23
-++/* [0x00000f68] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-++/* [0x00000f70] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f78] */ 0x0f9d71c0, 0x100200e7, // asr ra3, r0, rb23
-++/* [0x00000f80] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f88] */ 0x0f9d71c0, 0x100211e7, // asr rb7, r0, rb23
-++/* [0x00000f90] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-++/* [0x00000f98] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000fa0] */ 0x0f9d71c0, 0x10020127, // asr ra4, r0, rb23
-++/* [0x00000fa8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000fb0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000fb8] */ 0xf6f5fb00, 0xe0020867, // mov r1,0xf6f5fb00
-++/* [0x00000fc0] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000fc8] */ 0x0f9d71c0, 0x10020167, // asr ra5, r0, rb23
-++/* [0x00000fd0] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000fd8] */ 0x0f9d71c0, 0x10021267, // asr rb9, r0, rb23
-++/* [0x00000fe0] */ 0x04040100, 0xe0020867, // mov r1,0x4040100
-++/* [0x00000fe8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000ff0] */ 0x0f9d71c0, 0x100201a7, // asr ra6, r0, rb23
-++/* [0x00000ff8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00001000] */ 0x0f9d71c0, 0x100212a7, // asr rb10, r0, rb23
-++/* [0x00001008] */ 0xffff0000, 0xe0020867, // mov r1,0xffff0000
-++/* [0x00001010] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00001018] */ 0x0f9d71c0, 0x100201e7, // asr ra7, r0, rb23
-++/* [0x00001020] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00001028] */ 0x0f9d71c0, 0x100212e7, // asr rb11, r0, rb23
-++/* [0x00001030] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001038] */ 0x0f9e70c0, 0x100213e7, // asr rb15, r0, r3
-++/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001048] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-++/* [0x00001050] */ 0x8f9c00ff, 0xd0024823, // asr r0, r0, r3 ; mov r3, 0
-++/* [0x00001058] */ 0x119c81c0, 0xd00213a7, // shl rb14, r0, 8
-++// ::mc_filter
-+ // :yloop
-+-/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001060] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001068] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++/* [0x00001070] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001078] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001080] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001088] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001090] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001098] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000010a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000010a8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000010b0] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000010b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000010c0] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000010c8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-++/* [0x000010d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000010d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000010e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000010e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000010f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000010f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001100] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001108] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001110] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001118] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001120] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001128] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001130] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001138] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001140] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001148] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001150] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001158] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001160] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-++/* [0x00001168] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001170] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001178] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001180] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001188] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001190] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001198] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000011a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000011a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000011b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000011b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000011c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000011c8] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000011d0] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000011d8] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000011e0] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000011e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000011f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000011f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00001200] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00001208] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00001210] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001218] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001220] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001228] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001230] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001238] */ 0xfffffb28, 0xf0f809e7, // brr -, r:per_block_setup
-++/* [0x00001240] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001248] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001250] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
-+-/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+-/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloopb
-+-/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+-/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001258] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001260] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++/* [0x00001268] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001270] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001278] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001280] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001288] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001290] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001298] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000012a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000012a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000012b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000012b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000012c0] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-++/* [0x000012c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000012d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000012d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000012e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000012e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000012f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000012f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001300] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001308] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001310] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001318] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001320] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001328] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001330] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001338] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001340] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001348] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001350] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001358] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-++/* [0x00001360] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001368] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001370] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001378] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001380] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001388] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001398] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000013a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000013a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000013b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000013b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000013c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000013c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000013d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000013d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000013e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000013e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000013f0] */ 0x0f9ce3c0, 0xd0020827, // asr r0, r1, 14
-++/* [0x000013f8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00001400] */ 0x405b8006, 0xd00049e0, // nop                     ; mul24 r0, r0 << 8, ra22 << 8
-++/* [0x00001408] */ 0x0c4a7380, 0x10020867, // add r1, r1, ra18
-++/* [0x00001410] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001418] */ 0xfffffe20, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001420] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00001428] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001430] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001438] */ 0xfffff928, 0xf0f809e7, // brr -, r:per_block_setup
-++/* [0x00001440] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001448] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001450] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_interrupt_exit12
-+-/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001458] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001460] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001468] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001470] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001478] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001480] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001488] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001490] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001498] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000014e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000014e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_exit1
-++/* [0x000014f0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000014f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001500] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001508] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001510] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001518] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001520] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001528] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 6e552d9..760bd17 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,15 +4,16 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 144)
-+-#define mc_filter_uv_b0 (rpi_shader + 334)
-+-#define mc_filter_uv_b (rpi_shader + 486)
-+-#define mc_exit (rpi_shader + 662)
-+-#define mc_interrupt_exit8 (rpi_shader + 680)
-+-#define mc_setup (rpi_shader + 710)
-+-#define mc_filter (rpi_shader + 864)
-+-#define mc_filter_b (rpi_shader + 1104)
-+-#define mc_interrupt_exit12 (rpi_shader + 1360)
-+-#define mc_end (rpi_shader + 1398)
-++#define mc_filter_uv (rpi_shader + 130)
-++#define mc_filter_uv_b0 (rpi_shader + 312)
-++#define mc_filter_uv_b (rpi_shader + 464)
-++#define mc_exit (rpi_shader + 640)
-++#define mc_interrupt_exit8 (rpi_shader + 658)
-++#define mc_setup (rpi_shader + 688)
-++#define mc_filter (rpi_shader + 1048)
-++#define mc_filter_b (rpi_shader + 1174)
-++#define mc_interrupt_exit12 (rpi_shader + 1302)
-++#define mc_exit1 (rpi_shader + 1340)
-++#define mc_end (rpi_shader + 1356)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index a0b8e5a..60d1ec2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -21,6 +21,7 @@
-+ #
-+ # ra16                                          clipped(row start address+elem_num)&~3
-+ # ra17                                          per-channel shifts
-++# ra18                                          0x4000
-+ # ra19                                          next ra17
-+ #
-+ # rb16                                          pitch
-+@@ -86,7 +87,7 @@
-+ 
-+ 
-+ ################################################################################
-+-# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
-+ ::mc_setup_uv
-+ 
-+ # Read starting kernel
-+@@ -132,36 +133,6 @@ mov ra13, 0
-+ mov ra14, 0
-+ mov ra15, 0
-+ 
-+-# Compute part of VPM to use for DMA output
-+-mov r3, unif
-+-shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-+-
-+-# Compute part of VPM to save data into
-+-shl r2, r3, 1
-+-and r2, r2, 15    # r2 = bcd0
-+-mov r1, r2        # r1 = bcd0
-+-asr r1, r1, 2     # r1 = bc
-+-shl r1, r1, 6     # r1 = bc000000
-+-mov r0, r2        # r0 = bcd0
-+-and r0, r0, 3     # r0 = d0
-+-add r0, r0, r1    # r0 = bc0000d0
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+-add rb28, r0, r1
-+-asr r0, r0, 1     # r0 = bc0000d
-+-# Prepare VPM command for 16bit intermediates
-+-mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+-add rb21, r0, r1
-+-
-+ # Compute base address for first and second access
-+ mov r0, ra_x           # Load x
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+@@ -175,10 +146,31 @@ min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-+ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_frame_base, r2
-+-add t0s, r2, r1
-++add t1s, r2, r1
-++
-++mov r2,8
-++shl rb12,unif, r2 # offset before shift
-++add rb13,unif,r2  # offset after shift
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, unif
-++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-+ 
-+-mov rb12,unif # offset before shift
-+-mov rb13,unif # offset after shift
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++add rb28, r0, r1  # VPM 8bit storage
-++asr r2, r0, 1     # r0 = bc0000d
-++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-++add rb21, r2, r1  # VPM for 16bit intermediates
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1  # DMA out
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -187,7 +179,7 @@ add ra_y, ra_y, 1
-+ bra -, ra31
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_x
-+-add t0s, r1, ra_frame_base
-++add t1s, r1, ra_frame_base
-+ 
-+ 
-+ 
-+@@ -248,17 +240,15 @@ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ mov r0, unif # U offset/weight
-+ asr rb15, r0, r2  # Compute offset from MSBs
-+ shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-++asr r3, r0, r2  # Compute weight from LSBs
-+ mov r0, unif # V offset/weight
-+ asr.ifnz rb15, r0, r2
-+ shl r0, r0, r2
-+-asr.ifnz rb14, r0, r2
-++asr.ifnz r3, r0, r2
-++shl rb14,r3,8 # Scale up weights so we can use mul24 in signed fashion
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+-
-+-mov r5rep, -8
-+-
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+ 
-+@@ -269,7 +259,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -278,7 +268,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -301,11 +291,6 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-+-mov rb12,32 # TODO remove these to make P weighted prediction work properly
-+-mov rb13,6
-+-mov rb14,1
-+-mov rb15,0
-+-
-+ # apply vertical filter and write to VPM
-+ 
-+ nop                     ; mul24 r1, ra14, rb10
-+@@ -412,7 +397,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -421,7 +406,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -542,7 +527,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -551,7 +536,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -617,9 +602,9 @@ mov  -, vw_wait # wait on the VDW
-+ mov -,srel(0)
-+ 
-+ ldtmu0
-++ldtmu1
-+ ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu1
-+ 
-+ nop        ; nop ; thrend
-+ nop        ; nop # delay slot 1
-+@@ -630,9 +615,9 @@ nop        ; nop # delay slot 2
-+ mov  -, vw_wait # wait on the VDW
-+ 
-+ ldtmu0
-++ldtmu1
-+ ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu1
-+ 
-+ mov -,sacq(0) # 1
-+ mov -,sacq(0) # 2
-+@@ -656,200 +641,249 @@ nop        ; nop # delay slot 2
-+ # For P frames we make the second x,y coordinates offset by +8
-+ 
-+ ################################################################################
-+-# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-++# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+ ::mc_setup
-++  mov r3, 16
-+ 
-+-# Read starting kernel
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-+-
-+-# Compute base address for first and second access
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-add ra_y, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-+-max r1, r1, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+-add t0s, r2, r1 ; mov ra_frame_base, r2
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-+-max r1, r1, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+-add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-
-++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-++  mov ra8, unif
-++  mov ra9, unif
-++  mov ra10, unif
-++  mov ra11, unif
-+ 
-+ # Read image dimensions
-+-sub rb25,unif,1
-+-sub rb30,unif,1
-++  mov r1, unif # width_height
-++  shl r0,r1,r3
-++  asr r1,r1,r3 # width
-++  asr r0,r0,r3 # height
-++  sub rb_frame_width_minus_1,r1,1
-++  sub rb_frame_height_minus_1,r0,1
-+ 
-+ # get source pitch
-+-mov rb16, unif
-++  mov rb_pitch, unif
-+ 
-+ # get destination pitch
-+-mov r0, unif
-+-mov r1, vdw_setup_1(0)
-+-add rb24, r1, r0
-++  mov r0, unif
-++  mov r1, vdw_setup_1(0)
-++  add rb24, r1, r0
-+ 
-+-# load constants
-+-
-+-mov ra20, 1
-+-mov ra22, 256
-+-mov ra30, 64
-+-
-+-mov rb20, 0xffffff00
-+-mov rb22, 255
-+-mov rb23, 24
-++# Compute base address for first and second access
-++  mov r1, ra8 # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
-++  shl ra_xshift_next, r0, 3 # Compute shifts
-++  add ra_y, r1, 1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-++  max r1, r1, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++  add t0s, r2, r1 ; mov ra_frame_base, r2
-++
-++  mov r1, ra10 # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
-++  shl rx_xshift2_next, r0, 3 # Compute shifts
-++  add ra_y2, r1, 1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-++  max r1, r1, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++  add t1s, r2, r1 ; mov ra_frame_base2, r2
-+ 
-+-# touch vertical context to keep simulator happy
-+ 
-+-mov ra8, 0
-+-mov ra9, 0
-+-mov ra10, 0
-+-mov ra11, 0
-+-mov ra12, 0
-+-mov ra13, 0
-+-mov ra14, 0
-+-mov ra15, 0
-++# load constants
-+ 
-+-# Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-++  mov ra20, 1
-++  mov ra22, 256
-++  mov ra30, 64
-+ 
-+-# Compute part of VPM to save data into
-+-mov r2, qpu_num   # qpu_num = abcd
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+-add rb28, r0, r1
-++  mov rb20, 0xffffff00
-++  mov rb22, 255
-++  mov rb23, 24
-+ 
-+-mov rb12,unif # offset before shift
-+-mov rb13,unif # shift
-++# touch vertical context to keep simulator happy
-+ 
-+-# Dump padding words
-+-mov r0, unif
-++  mov ra8, 0
-++  mov ra9, 0
-++  mov ra10, 0
-++  mov ra11, 0
-++  mov ra12, 0
-++  mov ra13, 0
-++  mov ra14, 0
-++  mov ra15, 0
-++  mov ra18, 0x4000
-++
-++# Compute part of VPM to use
-++  mov r2, qpu_num
-++  mov r1, r2
-++  asr r1, r1, 2
-++  shl r1, r1, 6
-++  mov r0, r2
-++  and r0, r0, 3
-++  add r0, r0, r1
-++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++  add rb28, r0, r1  # VPM for saving data
-++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++  shl r0, r0, 5
-++  add rb27, r0, r1  # Command for dma output
-++
-++# Weighted prediction denom
-++
-++  mov r1, unif # offset_shift
-++  shl r0,r1,r3 ; mov r2,8
-++  asr rb13,r0,r3 # shift
-++  asr rb12,r1,r3 # offset
-++  add rb13,rb13,r2    # mul24 is unsigned so scale up into high bits
-++  shl rb12, rb12, r2 # Account for larger shift
-+ 
-+ # submit texture requests for second line
-+-max r1, ra_y, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1
-+-nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_frame_base
-+-
-+-max r1, ra_y2, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-bra -, ra31
-+-add ra_y2, ra_y2, 1           # Delay 1
-+-nop ; mul24 r1, r1, rb_pitch  # Delay 2
-+-add t0s, r1, ra_frame_base2   # Delay 3
-+-
-+-
-+-################################################################################
-+-
-+-# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-+-# In a P block, only the first half of coefficients contain used information.
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x, ra_x16_base point to the current coordinates for this block
-+-::mc_filter
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-++  max r1, ra_y, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1
-++  nop ; mul24 r1, r1, rb_pitch
-++  add t0s, r1, ra_frame_base
-++
-++  max r1, ra_y2, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1
-++  nop ; mul24 r1, r1, rb_pitch
-++  add t1s, r1, ra_frame_base2
-++
-++# FALL THROUGHT TO PER-BLOCK SETUP
-++
-++# Start of per-block setup code
-++# P and B blocks share the same setup code to save on Icache space
-++:per_block_setup
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov ra31, unif
-+ 
-+ # per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov rx_xshift2, rx_xshift2_next
-++  mov ra_xshift, ra_xshift_next
-++  mov rx_xshift2, rx_xshift2_next
-+ 
-+ # get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-mov ra_y_next, r1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0   ; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2_next, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+-
-++  mov r3, 16
-++  mov r1, unif # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++  shl ra_xshift_next, r0, 3 # Compute shifts
-++  mov ra_y_next, r1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add ra_frame_base_next, r2, r0 ; mov r1, unif # y2_x2
-++
-++  shl r0,r1,r3 # r0 is x2<<16
-++  asr r1,r1,r3 # r1 is y2
-++  asr r0,r0,r3 # r0 is x2
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++  shl rx_xshift2_next, r0, 3 # Compute shifts
-++  mov ra_y2_next, r1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+ 
-+ # set up VPM write
-+-mov vw_setup, rb28
-++  mov vw_setup, rb28
-+ 
-+ # get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-++  mov r0, unif
-++  shr r1, r0, r3 # Extract width
-++  sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++  and r0, r0, rb22 # Extract height
-++  add rb17, r0, 5
-++  add rb18, r0, 7
-++  shl r0, r0, 7
-++  add r0, r0, r1 # Combine width and height of destination area
-++  shl r0, r0, r3 # Shift into bits 16 upwards of the vdw_setup0 register
-++  add rb26, r0, rb27
-+ 
-+ # get filter coefficients and discard unused B frame values
-+-mov r0, unif
-+-mov.ifnz -, unif # Alternate coefficients are unused for P frames
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb4, r0, rb23
-+-
-+-mov r0, unif # Frame0 offset/weight
-+-mov.ifnz -, unif # Frame1 offset/weight unused
-+-asr rb15, r0, r2  # Compute offset from MSBs
-+-shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-+-
-+-# r3 is loop counter
-++  mov r0, unif ; mov r1,1  # Packed filter offsets, unpack into ra8... (to be used for vertical context later)
-++  asr ra9, r0, rb23;      mul24 r0, r0, ra22 # my2
-++  asr ra8, r0, rb23;      mul24 r0, r0, ra22 # mx2
-++  asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22 # my:my2
-++  asr.ifz ra8, r0, rb23                      # mx:mx2
-++  sub ra9,3,ra9
-++  sub ra8,3,ra8
-++  shl ra9,ra9,3   # Scale up by 8
-++  shl ra8,ra8,3   # Scale up by 8
-++# Now if we want aligned we have a mul of 1, so put 0 coefficients at the top
-++  mov r1,0xffff00
-++  shl r0, r1, ra8
-++  asr ra0, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb4, r0, rb23
-++
-++  mov r1,0x1040400
-++  shl r0, r1, ra8
-++  asr ra1, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb5, r0, rb23
-++
-++  mov r1,0xfbf5f600
-++  shl r0, r1, ra8
-++  asr ra2, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb6, r0, rb23
-++
-++  mov r1,0x11283a40
-++  shl r0, r1, ra8
-++  asr ra3, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb7, r0, rb23
-++
-++  mov r1,0x3a281100
-++  shl r0, r1, ra8
-++  asr ra4, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb8, r0, rb23
-++
-++  mov r1,0xf6f5fb00
-++  shl r0, r1, ra8
-++  asr ra5, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb9, r0, rb23
-++
-++  mov r1,0x4040100
-++  shl r0, r1, ra8
-++  asr ra6, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb10, r0, rb23
-++
-++  mov r1,0xffff0000
-++  shl r0, r1, ra8
-++  asr ra7, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb11, r0, rb23
-++
-++# Extract weighted prediction information
-++  mov r0, unif      # offset/weight  TODO move up
-++  asr rb15, r0, r3  # Compute offset from MSBs
-++  bra -, ra31
-++  shl r0, r0, r3    #                                                            Delay 1
-++  asr r0, r0, r3 ; mov r3, 0 # Compute weight from LSBs and reset loop counter   Delay 2
-++  shl rb14, r0, 8 # Use a larger shift to avoid unsigned multiply problem        Delay 3
-+ 
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-++################################################################################
-++# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-++# In a P block, y2_x2 should be y_x+8
-++# At this point we have already issued two pairs of texture requests for the current block
-+ 
-+-mov r3, 0
-++::mc_filter
-+ 
-+ :yloop
-+ # retrieve texture results and pick out bytes
-+@@ -858,91 +892,90 @@ mov r3, 0
-+ # If we knew there was no clipping then this code would get simpler.
-+ # Perhaps we could add on the pitch and clip using larger values?
-+ 
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, rx_xshift2
-+-mov.ifz ra_y2, ra_y2_next
-++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++  shr r1, r4, rx_xshift2
-++  mov.ifz ra_y2, ra_y2_next
-+ 
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y2, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++  max r2, ra_y, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+ 
-++  max r2, ra_y2, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+ 
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # apply horizontal filter
-+-nop                  ; mul24 r2, r0, ra0
-+-nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-add r0, r2, r3       ; mov r3, rb31
-+-sub.setf -, r3, 8    ; mov ra12, ra13
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-brr.anyn -, r:yloop
-+-mov ra13, ra14       # Delay slot 1
-+-mov ra14, ra15       # Delay slot 2
-+-mov ra15, r0         # Delay slot 3
-++  nop                  ; mul24 r2, r0, ra0
-++  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++  add r0, r2, r3       ; mov r3, rb31
-++  sub.setf -, r3, 8    ; mov ra8, ra9
-++  mov ra9, ra10
-++  mov ra10, ra11
-++  mov ra11, ra12
-++  mov ra12, ra13
-++  brr.anyn -, r:yloop
-++  mov ra13, ra14       # Delay slot 1
-++  mov ra14, ra15       # Delay slot 2
-++  mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb10
-+-nop                     ; mul24 r0, ra13, rb9
-+-add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-
-+-add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-nop                     ; mul24 r1, r1, rb14
-+-add r1, r1, rb12
-+-asr r1, r1, rb13
-+-brr.anyn -, r:yloop
-+-add r1, r1, rb15       # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++  nop                     ; mul24 r1, ra14, rb10
-++  nop                     ; mul24 r0, ra13, rb9
-++  add r1, r1, r0          ; mul24 r0, ra12, rb8
-++  add r1, r1, r0          ; mul24 r0, ra15, rb11
-++  add r1, r1, r0          ; mul24 r0, ra8, rb4
-++  add r1, r1, r0          ; mul24 r0, ra9, rb5
-++  add r1, r1, r0          ; mul24 r0, ra10, rb6
-++  add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++  add r1, r1, r0          ; mov -, vw_wait
-++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++  asr r1, r1, 14
-++  nop                     ; mul24 r1, r1, rb14
-++  add r1, r1, rb12
-++  asr r1, r1, rb13
-++  brr.anyn -, r:yloop
-++  add r1, r1, rb15       # Delay 1
-++  min r1, r1, rb22       # Delay 2
-++  max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out
-+ 
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW setup 0    Delay 1
-+-mov vw_setup, rb29 # Stride         Delay 2
-+-mov vw_addr, unif # start the VDW   Delay 3
-++  brr -, r:per_block_setup
-++  mov vw_setup, rb26 # VDW setup 0    Delay 1
-++  mov vw_setup, rb29 # Stride         Delay 2
-++  mov vw_addr, unif # start the VDW   Delay 3
-+ 
-+ 
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+ # In a P block, only the first half of coefficients contain used information.
-+ # At this point we have already issued two pairs of texture requests for the current block
-+ # May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-+@@ -952,92 +985,6 @@ mov vw_addr, unif # start the VDW   Delay 3
-+ # Or possibly by taking advantage of symmetry?
-+ # From 19->7 32bits per command.
-+ ::mc_filter_b
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov rx_xshift2, rx_xshift2_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-mov ra_y_next, r1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0   ; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2_next, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+-
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# get filter coefficients and discard unused B frame values
-+-mov r0, unif
-+-mov r1, 1
-+-mov.ifnz r0, unif # Alternate coefficients are unused for P frames
-+-nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb4, r0, rb23
-+-
-+-mov r0, unif # Frame0 offset/weight
-+-mov.ifnz r0, unif # Frame1 offset/weight unused
-+-asr rb15, r0, r2  # Compute offset from MSBs
-+-shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-+-
-+-# r3 is loop counter
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-mov r3, 0
-+-
-+ :yloopb
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+@@ -1045,111 +992,123 @@ mov r3, 0
-+ # If we knew there was no clipping then this code would get simpler.
-+ # Perhaps we could add on the pitch and clip using larger values?
-+ 
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, rx_xshift2
-+-mov.ifz ra_y2, ra_y2_next
-++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++  shr r1, r4, rx_xshift2
-++  mov.ifz ra_y2, ra_y2_next
-+ 
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y2, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++  max r2, ra_y, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+ 
-++  max r2, ra_y2, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+ 
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # apply horizontal filter
-+-nop                  ; mul24 r2, r0, ra0
-+-nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-add r0, r2, r3       ; mov r3, rb31
-+-sub.setf -, r3, 8    ; mov ra12, ra13
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-brr.anyn -, r:yloopb
-+-mov ra13, ra14       # Delay slot 1
-+-mov ra14, ra15       # Delay slot 2
-+-mov ra15, r0         # Delay slot 3
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r1, ra14, rb10
-+-nop                     ; mul24 r0, ra13, rb9
-+-add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-
-+-add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
-+-add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-add r1, r1, r0
-+-brr.anyn -, r:yloopb
-+-asr r1, r1, 7          # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++  nop                  ; mul24 r2, r0, ra0
-++  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++  add r0, r2, r3       ; mov r3, rb31
-++  sub.setf -, r3, 8    ; mov ra8, ra9
-++  mov ra9, ra10
-++  mov ra10, ra11
-++  mov ra11, ra12
-++  mov ra12, ra13
-++  brr.anyn -, r:yloopb
-++  mov ra13, ra14       # Delay slot 1
-++  mov ra14, ra15       # Delay slot 2
-++  mov ra15, r0         # Delay slot 3
-++
-++  # apply vertical filter and write to VPM
-++
-++  nop                     ; mul24 r1, ra14, rb10
-++  nop                     ; mul24 r0, ra13, rb9
-++  add r1, r1, r0          ; mul24 r0, ra12, rb8
-++  add r1, r1, r0          ; mul24 r0, ra15, rb11
-++  add r1, r1, r0          ; mul24 r0, ra8, rb4
-++  add r1, r1, r0          ; mul24 r0, ra9, rb5
-++  add r1, r1, r0          ; mul24 r0, ra10, rb6
-++  add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++  add r1, r1, r0          ; mov -, vw_wait
-++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++  asr r0, r1, 14
-++  asr r1, r1, 6           # Wait state so we can use the rotate instruction
-++  nop                     ; mul24 r0, r0 << 8, ra22 << 8 # Rotate to align left and right halves
-++  add r1, r1, ra18
-++  add r1, r1, r0
-++  brr.anyn -, r:yloopb
-++  asr r1, r1, 15         # Delay 1
-++  min r1, r1, rb22       # Delay 2
-++  max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW setup 0    Delay 1
-+-mov vw_setup, rb29 # Stride         Delay 2
-+-mov vw_addr, unif # start the VDW   Delay 3
-++  brr -, r:per_block_setup
-++  mov vw_setup, rb26 # VDW setup 0    Delay 1
-++  mov vw_setup, rb29 # Stride         Delay 2
-++  mov vw_addr, unif # start the VDW   Delay 3
-+ 
-+ ################################################################################
-+ 
-+ # mc_interrupt_exit12()
-+ ::mc_interrupt_exit12
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-mov -,sacq(0) # 4
-+-mov -,sacq(0) # 5
-+-mov -,sacq(0) # 6
-+-mov -,sacq(0) # 7
-+-mov -,sacq(0) # 8
-+-mov -,sacq(0) # 9
-+-mov -,sacq(0) # 10
-+-mov -,sacq(0) # 11
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-++  mov  -, vw_wait # wait on the VDW
-++
-++  ldtmu0
-++  ldtmu0
-++  ldtmu1
-++  ldtmu1
-++
-++  mov -,sacq(0) # 1
-++  mov -,sacq(0) # 2
-++  mov -,sacq(0) # 3
-++  mov -,sacq(0) # 4
-++  mov -,sacq(0) # 5
-++  mov -,sacq(0) # 6
-++  mov -,sacq(0) # 7
-++  mov -,sacq(0) # 8
-++  mov -,sacq(0) # 9
-++  mov -,sacq(0) # 10
-++  mov -,sacq(0) # 11
-++
-++  nop        ; nop ; thrend
-++  mov interrupt, 1; nop # delay slot 1
-++  nop        ; nop # delay slot 2
-++
-++
-++::mc_exit1
-++  mov  -, vw_wait # wait on the VDW
-++
-++  ldtmu0
-++  ldtmu1
-++  ldtmu0
-++  ldtmu1
-++  nop        ; nop ; thrend
-++  mov interrupt, 1; nop # delay slot 1
-++  nop        ; nop # delay slot 2
-+ 
-+ 
-+ ::mc_end
-+-- 
-+2.5.0
-+
-+
-+From 2be17e0759404007c938bdd478e1e76445d9ecbe Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 10:58:25 +0100
-+Subject: [PATCH 48/68] Added option to simulate QPUs
-+
-+---
-+ libavcodec/hevc.c          | 288 +++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.c       |  24 ++--
-+ libavcodec/rpi_shader.qasm |   6 +-
-+ 3 files changed, 295 insertions(+), 23 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index c6b619b..7914afb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -54,6 +54,8 @@
-+   // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-+   // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-+ 
-++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-++  //#define RPI_SIMULATE_QPUS
-+ 
-+ #endif
-+ 
-+@@ -122,7 +124,6 @@ static void pic_arrays_free(HEVCContext *s)
-+ 
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    printf("pic_arrays_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+       gpu_free(&s->coeffs_buf_default);
-+       s->coeffs_buf_arm[0] = 0;
-+@@ -172,11 +173,9 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ #ifdef RPI
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    assert(sps);
-++    av_assert0(sps);
-+     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    printf("pic_arrays_init\n");
-+-    printf("Allocated %d\n",coefs_per_row);
-+     gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+     s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+     if (!s->coeffs_buf_arm[0])
-+@@ -2975,6 +2974,274 @@ static void rpi_inter_clear(HEVCContext *s)
-+ #endif
-+ }
-+ 
-++
-++#ifdef RPI_SIMULATE_QPUS
-++
-++static int32_t clipx(int x,int FRAME_WIDTH)
-++{
-++	if (x<=0) return 0;
-++	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
-++	return x;
-++}
-++
-++static int32_t clipy(int y,int FRAME_HEIGHT)
-++{
-++	if (y<=0) return 0;
-++	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
-++	return y;
-++}
-++
-++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
-++{
-++   int32_t vsum = 0;
-++   int x, y;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
-++
-++      vsum += lumaFilter[my][y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+round)>>denom)+offset;
-++
-++   return av_clip_uint8( vsum );
-++}*/
-++
-++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-++{
-++  int32_t vsum = 0;
-++  int x, y;
-++  int chromaFilterH[4];
-++  int chromaFilterV[4];
-++  int i;
-++  int offset_after = offset_weight>>16;
-++  int weight = (offset_weight<<16)>>16;
-++  for(i=0;i<4;i++) {
-++    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
-++    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
-++  }
-++
-++   for (y = 0; y < 4; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 4; x++)
-++         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-++
-++      vsum += chromaFilterV[y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-++
-++   return vsum;
-++}
-++
-++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
-++
-++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-++{
-++  int32_t vsum = 0;
-++  int x, y;
-++  int i;
-++  int offset_after = offset_weight>>16;
-++  int weight = (offset_weight<<16)>>16;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-++
-++      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-++
-++   return vsum;
-++}
-++
-++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
-++{
-++  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
-++  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
-++  int pitch = frame->linesize[cIdx];
-++  uint32_t base = get_vc_address(frame->buf[cIdx]);
-++  if (p>=base && p<base+pitch*pic_height) {
-++    return frame->data[cIdx] + (p-base);
-++  }
-++  return NULL;
-++}
-++
-++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-++{
-++  SliceHeader *sh   = &s->sh;
-++  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-++  int i;
-++  if (arm) return arm;
-++  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-++  {
-++    for(i=0;i<sh->nb_refs[L0];i++) {
-++      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-++      if (arm) return arm;
-++    }
-++  }
-++  if (sh->slice_type == B_SLICE) {
-++    for(i=0;i<sh->nb_refs[L1];i++) {
-++      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-++      if (arm) return arm;
-++    }
-++  }
-++  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-++  exit(-1);
-++  return NULL;
-++}
-++
-++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-++{
-++  uint32_t next_kernel;
-++  uint32_t x0;
-++  uint32_t y0;
-++  uint8_t *ref_u_base;
-++  uint8_t *ref_v_base;
-++  uint32_t frame_width = p[5];
-++  uint32_t frame_height = p[6];
-++  uint32_t pitch = p[7];
-++  uint32_t dst_pitch = p[8];
-++  int32_t offset_before = p[9];
-++  int32_t denom = p[10];
-++  uint32_t vpm_id = p[11];
-++  uint32_t tmp_u_dst[256];
-++  uint32_t tmp_v_dst[256];
-++  while(1) {
-++    p += 12;
-++    next_kernel = p[0-12];
-++    x0 = p[1-12];
-++    y0 = p[2-12];
-++    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
-++      int x,y;
-++      uint32_t width_height = p[5];
-++      uint32_t hcoeffs = p[6];
-++      uint32_t vcoeffs = p[7];
-++      uint32_t offset_weight_u = p[8];
-++      uint32_t offset_weight_v = p[9];
-++      uint8_t *this_u_dst;
-++      uint8_t *this_v_dst;
-++      uint32_t width = width_height >> 16;
-++      uint32_t height = (width_height << 16) >> 16;
-++      ref_u_base = compute_arm_addr(s,p[3-12],1);
-++      ref_v_base = compute_arm_addr(s,p[4-12],2);
-++      if (next_kernel!=s->mc_filter_uv_b0)
-++      {
-++        this_u_dst = compute_arm_addr(s,p[10],1);
-++        this_v_dst = compute_arm_addr(s,p[11],2);
-++      }
-++      for (y=0; y<height; ++y) {
-++        for (x=0; x<width; ++x) {
-++          if (next_kernel==s->mc_filter_uv) {
-++            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          } else if (next_kernel==s->mc_filter_uv_b0) {
-++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-++            tmp_u_dst[x+y*16] = refa;
-++            tmp_v_dst[x+y*16] = refb;
-++          } else {
-++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          }
-++        }
-++      }
-++    } else {
-++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-++      break;
-++    }
-++  }
-++}
-++
-++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-++{
-++  uint32_t next_kernel;
-++  int y_x,y2_x2;
-++  uint32_t x0;
-++  uint32_t y0;
-++  uint32_t x2;
-++  uint32_t y2;
-++  uint8_t *ref_y_base;
-++  uint8_t *ref_y2_base;
-++  uint32_t frame_width_height = p[4];
-++  uint32_t frame_width = frame_width_height>>16;
-++  uint32_t frame_height = (frame_width_height<<16)>>16;
-++  uint32_t pitch = p[5];
-++  uint32_t dst_pitch = p[6];
-++  int offset_shift = p[7];
-++  int32_t offset_before = offset_shift>>16;
-++  int32_t denom = (offset_shift<<16)>>16;
-++  while(1) {
-++    p += 9;
-++    next_kernel = p[8-9];
-++    y_x = p[0-9];
-++    x0 = (y_x<<16)>>16;
-++    y0 = y_x>>16;
-++    y2_x2 = p[2-9];
-++    x2 = (y2_x2<<16)>>16;
-++    y2 = y2_x2>>16;
-++
-++    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
-++      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-++      int x,y;
-++      uint32_t width_height = p[4];
-++      uint32_t my2_mx2_my_mx = p[5];
-++      uint32_t offset_weight = p[6];
-++      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-++      uint32_t width = width_height >> 16;
-++      uint32_t height = (width_height << 16) >> 16;
-++      ref_y_base = compute_arm_addr(s,p[1-9],0);
-++      ref_y2_base = compute_arm_addr(s,p[3-9],0);
-++      for (y=0; y<height; ++y) {
-++        for (x=0; x<width; ++x) {
-++          if (next_kernel==s->mc_filter) {
-++            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-++            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++          }
-++          else {
-++            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-++            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-++            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          }
-++        }
-++      }
-++    } else {
-++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-++      break;
-++    }
-++  }
-++}
-++
-++static void rpi_simulate_inter_qpu(HEVCContext *s)
-++{
-++  // First run the transform as normal
-++  int i;
-++  rpi_execute_transform(s);
-++  for(i=0;i<8;i++)
-++  {
-++    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-++  }
-++  for(i=0;i<12;i++)
-++  {
-++    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
-++  }
-++}
-++
-++#endif
-++
-++
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-+@@ -2993,7 +3260,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+-        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-++        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+@@ -3003,11 +3270,16 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-+     }
-+     s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ #endif
-+ 
-++#ifdef RPI_SIMULATE_QPUS
-++    rpi_simulate_inter_qpu(s);
-++    s->vpu_id = -1;
-++    return;
-++#endif
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+@@ -3088,7 +3360,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-+ #endif
-+ 
-+-    /*if (!s->enable_rpi) {
-++    if (!s->enable_rpi) {
-+       if (s->ps.pps->cross_component_prediction_enabled_flag)
-+         printf("Cross component\n");
-+       if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-+@@ -3097,7 +3369,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         printf("Weighted P slice\n");
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+-    }*/
-++    }
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index e12304b..4480f72 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -13,7 +13,7 @@
-+ #include <stdlib.h>
-+ #include <string.h>
-+ #include <stddef.h>
-+-#include <assert.h>
-++#include "libavutil/avassert.h"
-+ 
-+ #include "config.h"
-+ 
-+@@ -160,13 +160,13 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   // Now copy over the QPU code into GPU memory
-+   {
-+     int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
-+-    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+   }
-+   // And the VPU code
-+   {
-+     int num_bytes = sizeof(rpi_hevc_transform);
-+-    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-+   // And the transform coefficients
-+@@ -216,13 +216,13 @@ static void gpu_unlock(void) {
-+ static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+   p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+-  assert(p->vcsm_handle);
-++  av_assert0(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-  assert(p->vc_handle);
-++  av_assert0(p->vc_handle);
-+   p->arm = vcsm_lock(p->vcsm_handle);
-+-  assert(p->arm);
-++  av_assert0(p->arm);
-+   p->vc = mem_lock(mb, p->vc_handle);
-+-  assert(p->vc);
-++  av_assert0(p->vc);
-+   return 0;
-+ }
-+ 
-+@@ -243,7 +243,7 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+ 
-+ int gpu_get_mailbox(void)
-+ {
-+-  assert(gpu);
-++  av_assert0(gpu);
-+   return gpu->mb;
-+ }
-+ 
-+@@ -297,13 +297,13 @@ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+-  assert(p->vcsm_handle);
-++  av_assert0(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-  assert(p->vc_handle);
-++  av_assert0(p->vc_handle);
-+   p->arm = vcsm_lock(p->vcsm_handle);
-+-  assert(p->arm);
-++  av_assert0(p->arm);
-+   p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  assert(p->vc);
-++  av_assert0(p->vc);
-+   return 0;
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 60d1ec2..0686249 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -149,8 +149,8 @@ add t0s, r0, r1 ; mov ra_frame_base, r2
-+ add t1s, r2, r1
-+ 
-+ mov r2,8
-+-shl rb12,unif, r2 # offset before shift
-+-add rb13,unif,r2  # offset after shift
-++shl rb12,unif,r2 # offset before shift
-++add rb13,unif,r2  # denominator
-+ 
-+ # Compute part of VPM to use for DMA output
-+ mov r2, unif
-+@@ -185,7 +185,7 @@ add t1s, r1, ra_frame_base
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+ # ra_x, ra_x16_base point to the current coordinates for this block
-+-- 
-+2.5.0
-+
-+
-+From 70805b593a428f11dcaf1e558214884601f6c44a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 13:17:50 +0100
-+Subject: [PATCH 49/68] Increased motion vector memory and fixed block size
-+ computation for non-multiple of 2 block sizes
-+
-+---
-+ libavcodec/hevc.c | 50 +++++++++++++++++++++++++++++++-------------------
-+ 1 file changed, 31 insertions(+), 19 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7914afb..0d947ea 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -81,11 +81,9 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ // Split image of 2048 into parts 64 wide
-+ // So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-+-// Each block of 64*64
-+-// Smallest CTU size is 16x16, so smallest block is 8x8
-+-// Corresponds to a total of 83kbytes over all 12 QPUs
-++// For each block of 64*64 the smallest block size is 8x4
-+ #define RPI_LUMA_COMMAND_WORDS 9
-+-#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
-++#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -2029,11 +2027,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+-                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   if (weight_flag) {
-+                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+@@ -2076,12 +2076,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       if (weight_flag) {
-+@@ -2128,11 +2130,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+-                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   if (weight_flag) {
-+                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+@@ -2176,12 +2180,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+@@ -2233,11 +2239,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+-                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   *y++ = 1; // B frame weighted prediction not supported
-+                   *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+@@ -2280,12 +2288,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       u+=2; // Weights not supported in B slices
-+@@ -2296,7 +2306,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+                       u+=2; // Weights not supported in B slices
-+@@ -3165,14 +3175,15 @@ static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+ }
-+ 
-+ // mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+-static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+ {
-+   uint32_t next_kernel;
-+   int y_x,y2_x2;
-+-  uint32_t x0;
-+-  uint32_t y0;
-+-  uint32_t x2;
-+-  uint32_t y2;
-++  int x0;
-++  int y0;
-++  int x2;
-++  int y2;
-++  uint32_t *p0 = p;
-+   uint8_t *ref_y_base;
-+   uint8_t *ref_y2_base;
-+   uint32_t frame_width_height = p[4];
-+@@ -3202,13 +3213,15 @@ static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-+       uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+       uint32_t width = width_height >> 16;
-+       uint32_t height = (width_height << 16) >> 16;
-++      uint8_t *dst_base = s->frame->data[0];
-+       ref_y_base = compute_arm_addr(s,p[1-9],0);
-+       ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+       for (y=0; y<height; ++y) {
-+         for (x=0; x<width; ++x) {
-+           if (next_kernel==s->mc_filter) {
-+             int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+-            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            refa = av_clip_uint8(refa);
-++            this_dst[x+y*dst_pitch] = refa;
-+           }
-+           else {
-+             int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+@@ -3235,7 +3248,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+   }
-+   for(i=0;i<12;i++)
-+   {
-+-    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
-++    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+   }
-+ }
-+ 
-+@@ -3277,7 +3290,6 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+     rpi_simulate_inter_qpu(s);
-+-    s->vpu_id = -1;
-+     return;
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 1bd38623db52970590df65f4a7338d924c63a781 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 14:36:54 +0100
-+Subject: [PATCH 50/68] Added support for skip deblock
-+
-+---
-+ libavcodec/hevc.c        |  5 +++++
-+ libavcodec/hevc.h        |  2 ++
-+ libavcodec/hevc_filter.c | 14 ++++----------
-+ 3 files changed, 11 insertions(+), 10 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 0d947ea..1812801 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3384,6 +3384,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #endif
-++    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-++                        s->nal_unit_type == NAL_TSA_N   ||
-++                        s->nal_unit_type == NAL_STSA_N  ||
-++                        s->nal_unit_type == NAL_RADL_N  ||
-++                        s->nal_unit_type == NAL_RASL_N);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 33dedf7..aa4d218 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -916,6 +916,8 @@ typedef struct HEVCContext {
-+     int                 width;
-+     int                 height;
-+ 
-++    int used_for_ref;
-++
-+ #ifdef RPI
-+     int enable_rpi;
-+     HEVCMvCmd *unif_mv_cmds;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 11629e4..14a0952 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -512,16 +512,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-+ #ifdef DISABLE_DEBLOCK_NONREF
-+-    if (    s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )
-++    if (!s->used_for_ref)
-+       return; // Don't deblock non-reference frames
-+ #endif
-+ #ifdef DISABLE_DEBLOCK
-+     return;
-+ #endif
-++    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
-++        return;
-+ 
-+     if (x0) {
-+         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-+@@ -885,11 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+ 
-+ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+-    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )) {
-++    if (s->enable_rpi && s->used_for_ref) {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+         int curr_y = ((int *)f->progress->data)[0];
-+-- 
-+2.5.0
-+
-+
-+From 691cba7253bc997f6e8020542203c5733930d997 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 15:22:52 +0100
-+Subject: [PATCH 51/68] Added support for skip_frame
-+
-+---
-+ libavcodec/hevc.c | 15 ++++++++++-----
-+ 1 file changed, 10 insertions(+), 5 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 1812801..94ff677 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3384,11 +3384,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #endif
-+-    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-+-                        s->nal_unit_type == NAL_TSA_N   ||
-+-                        s->nal_unit_type == NAL_STSA_N  ||
-+-                        s->nal_unit_type == NAL_RADL_N  ||
-+-                        s->nal_unit_type == NAL_RASL_N);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+@@ -3848,6 +3843,16 @@ static int decode_nal_unit(HEVCContext *s, const HEVCNAL *nal)
-+         if (ret < 0)
-+             return ret;
-+ 
-++        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-++                        s->nal_unit_type == NAL_TSA_N   ||
-++                        s->nal_unit_type == NAL_STSA_N  ||
-++                        s->nal_unit_type == NAL_RADL_N  ||
-++                        s->nal_unit_type == NAL_RASL_N);
-++
-++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
-++            s->is_decoded = 0;
-++            break;
-++        }
-+         if (s->max_ra == INT_MAX) {
-+             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
-+                 s->max_ra = s->poc;
-+-- 
-+2.5.0
-+
-+
-+From b489872a14709b7e04285e039dff80b75823eb72 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 09:15:38 +0100
-+Subject: [PATCH 52/68] Fixed cache flushing of luma when using old method
-+
-+---
-+ libavcodec/hevc_filter.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 14a0952..b286bbf 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -919,7 +919,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-+ #ifdef RPI_LUMA_QPU
-+-        flush_buffer(s->frame->buf[1]);
-++        flush_buffer(s->frame->buf[0]);
-+ #endif
-+ 
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 6a4811cba68b1c27326300b37e43cdbad45ec45e Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 11:37:27 +0100
-+Subject: [PATCH 53/68] Option to parallelise coefficient decode and inter
-+ prediction and deblock for each frame
-+
-+---
-+ libavcodec/hevc.c              | 701 +++++++++++++++++++++++++++--------------
-+ libavcodec/hevc.h              |  74 +++--
-+ libavcodec/hevc_cabac.c        |  12 +-
-+ libavcodec/hevcpred_template.c |   5 +-
-+ 4 files changed, 522 insertions(+), 270 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 94ff677..594340a 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,8 +41,6 @@
-+ 
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-+-  // For some unknown reason, the code seems to crash if I do a late malloc
-+-  //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ 
-+@@ -56,6 +54,21 @@
-+ 
-+   // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-+   //#define RPI_SIMULATE_QPUS
-++  #ifdef RPI_WORKER
-++    #include "pthread.h"
-++  #endif
-++
-++  static void rpi_execute_dblk_cmds(HEVCContext *s);
-++  static void rpi_execute_transform(HEVCContext *s);
-++  static void rpi_execute_inter_qpu(HEVCContext *s);
-++  static void rpi_execute_pred_cmds(HEVCContext *s);
-++  static void rpi_execute_inter_cmds(HEVCContext *s);
-++  static void rpi_inter_clear(HEVCContext *s);
-++
-++  // Define INTER_PASS0 to do inter prediction in first pass
-++  //#define INTER_PASS0
-++  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
-++  //#define LAUNCH_PASS0
-+ 
-+ #endif
-+ 
-+@@ -103,6 +116,143 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+   return p->vc;
-+ }
-++#endif
-++
-++
-++#ifdef RPI_WORKER
-++
-++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-++
-++#define LOG_ENTER
-++#define LOG_EXIT
-++
-++// Call this (without worker_mutex held) when pass0 is complete to hand the current job to pass1 and move pass0 onto the next slot
-++static void worker_submit_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  pthread_mutex_lock(&s->worker_mutex); // worker_start() tests worker_tail under this mutex; changing it unlocked can lose the wakeup
-++  s->worker_tail++; // tail is the number of submitted jobs; this is the only place that changes it
-++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this (without worker_mutex held) to say we have completed pass1 for the current pass1 job
-++static void worker_complete_middle_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  pthread_mutex_lock(&s->worker_mutex); // worker_deblock_start() tests worker_middle under this mutex; changing it unlocked can lose the wakeup
-++  s->worker_middle++; // This is the only place that changes middle
-++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this (without worker_mutex held) to say we have completed pass2, i.e. the current pass2 job is fully done
-++static void worker_complete_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  pthread_mutex_lock(&s->worker_mutex); // worker_wait() and worker_pass0_ready() test worker_head under this mutex; changing it unlocked can lose the wakeup
-++  s->worker_head++; // head is the number of completed jobs; this is the only place that changes it
-++  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this at the end of a frame to block until every submitted job has completed (worker_head has caught up with worker_tail)
-++static void worker_wait(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  pthread_mutex_lock(&s->worker_mutex);
-++  while( s->worker_head !=s->worker_tail) // head counts completed jobs, tail counts submitted jobs
-++  {
-++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); // worker_complete_job() broadcasts this after advancing head
-++  }
-++  pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call worker_pass0_ready to block until fewer than RPI_MAX_JOBS jobs are
-++// outstanding, i.e. until the s->pass0_job slot is available to receive the next job.
-++static void worker_pass0_ready(HEVCContext *s)
-++{
-++  LOG_ENTER
-++    pthread_mutex_lock(&s->worker_mutex);
-++    // tail is the number of submitted jobs
-++    // head is the number of completed jobs
-++    // tail-head is the number of outstanding jobs in the queue
-++    // we need to ensure there is at least 1 of the RPI_MAX_JOBS slots left for us to use
-++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
-++    {
-++      // Wait until another job is completed (worker_complete_job() broadcasts worker_cond_head)
-++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++static void *worker_start(void *arg) // Pass1 worker loop; arg is the HEVCContext (presumably passed via pthread_create - confirm at the call site)
-++{
-++  HEVCContext *s = (HEVCContext *)arg;
-++  while(1) {
-++    pthread_mutex_lock(&s->worker_mutex);
-++    // Sleep until a job has been submitted that pass1 has not yet processed, or we are told to exit
-++    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-++    {
-++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++
-++    if (s->kill_worker) { // Exit request takes priority over any pending job
-++      break;
-++    }
-++    LOG_ENTER
-++    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++#ifndef LAUNCH_PASS0
-++    rpi_execute_inter_qpu(s);
-++#endif
-++#ifndef INTER_PASS0
-++    // Perform the queued inter prediction commands for this job
-++    rpi_execute_inter_cmds(s);
-++#endif
-++    // Wait for transform completion
-++    vpu_wait(s->vpu_id);
-++
-++    worker_complete_middle_job(s); // Advance worker_middle so the pass2 (deblock) worker can pick this job up
-++    LOG_EXIT
-++  }
-++  return NULL;
-++}
-++
-++static void *worker_deblock_start(void *arg) // Pass2 worker loop; arg is the HEVCContext (presumably passed via pthread_create - confirm at the call site)
-++{
-++  HEVCContext *s = (HEVCContext *)arg;
-++  while(1) {
-++    pthread_mutex_lock(&s->worker_mutex);
-++    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0) // Sleep until pass1 has finished a job that pass2 has not yet completed, or we are told to exit
-++    {
-++      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++
-++    if (s->kill_worker) { // Exit request takes priority over any pending job
-++      break;
-++    }
-++    LOG_ENTER
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++
-++    worker_complete_job(s); // Advance worker_head; wakes worker_wait() and worker_pass0_ready()
-++    LOG_EXIT
-++  }
-++  return NULL;
-++}
-+ 
-+ #endif
-+ 
-+@@ -119,19 +269,18 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-+ #ifdef RPI
-+-
-+-#ifdef EARLY_MALLOC
-+-#else
-+-    if (s->coeffs_buf_arm[0]) {
-+-      gpu_free(&s->coeffs_buf_default);
-+-      s->coeffs_buf_arm[0] = 0;
-+-    }
-+-    if (s->coeffs_buf_arm[2]) {
-+-      gpu_free(&s->coeffs_buf_accelerated);
-+-      s->coeffs_buf_arm[2] = 0;
-++    int job;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++      if (s->coeffs_buf_arm[job][0]) {
-++        gpu_free(&s->coeffs_buf_default[job]);
-++        s->coeffs_buf_arm[job][0] = 0;
-++      }
-++      if (s->coeffs_buf_arm[job][2]) {
-++        gpu_free(&s->coeffs_buf_accelerated[job]);
-++        s->coeffs_buf_arm[job][2] = 0;
-++      }
-+     }
-+ #endif
-+-#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -169,24 +318,26 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-+ #ifdef RPI
-+-#ifdef EARLY_MALLOC
-+-#else
-+     av_assert0(sps);
-+     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-    if (!s->coeffs_buf_arm[0])
-+-        goto fail;
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-    if (!s->coeffs_buf_arm[2])
-+-        goto fail;
-+-    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-    printf("Done\n");
-+-#endif
-++    int job;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++      printf("Allocated %d\n",coefs_per_row);
-++      for(job=0;job<RPI_MAX_JOBS;job++) {
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
-++        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-++        if (!s->coeffs_buf_arm[job][0])
-++            goto fail;
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-++        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-++        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-++        if (!s->coeffs_buf_arm[job][2])
-++            goto fail;
-++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-++        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-++      }
-++    }
-+ #endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+@@ -1023,7 +1174,7 @@ static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0,
-+ {
-+     if (s->enable_rpi) {
-+         HEVCLocalContext *lc = s->HEVClc;
-+-        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+         cmd->type = RPI_PRED_INTRA;
-+         cmd->size = log2_trafo_size;
-+         cmd->c_idx = c_idx;
-+@@ -1483,7 +1634,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_LUMA_UNI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1502,7 +1653,7 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_LUMA_BI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1524,7 +1675,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_CHROMA_UNI;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -1542,7 +1693,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+ static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -2024,7 +2175,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2044,7 +2195,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+         {
-+@@ -2073,7 +2224,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2097,7 +2248,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2127,7 +2278,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2147,7 +2298,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+ 
-+@@ -2177,7 +2328,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2202,7 +2353,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2236,7 +2387,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int x2 = x0 + (mv2->x >> 2);
-+             int y2 = y0 + (mv2->y >> 2);
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                   int bw = nPbW-start_x;
-+@@ -2252,7 +2403,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+         {
-+@@ -2285,7 +2436,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2314,7 +2465,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2819,40 +2970,54 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ {
-+     int n;
-++    int job = s->pass2_job;
-+     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+-    int (*p)[2] = s->dblk_cmds;
-+-    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-++    int (*p)[2] = s->dblk_cmds[job];
-++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-+         ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-+     }
-+-    s->num_dblk_cmds = 0;
-++    s->num_dblk_cmds[job] = 0;
-+ }
-+ 
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-++#ifdef LAUNCH_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-+     //int j;
-+     //int16_t *coeffs = s->coeffs_buf_arm[i];
-+     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated);
-+-    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-++    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+     //vpu_wait(s->vpu_id);
-+ 
-+     for(i=0;i<4;i++)
-+-        s->num_coeffs[i] = 0;
-++        s->num_coeffs[job][i] = 0;
-+ }
-+ 
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+-  HEVCPredCmd *cmd = s->univ_pred_cmds;
-++  int job = s->pass2_job;
-++  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-++#ifdef RPI_WORKER
-++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++#else
-+   HEVCLocalContext *lc = s->HEVClc;
-++#endif
-+ 
-+-  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
-++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
-++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
-+       if (cmd->type == RPI_PRED_INTRA) {
-+           lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-+           lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-+@@ -2871,21 +3036,26 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ #endif
-+       }
-+   }
-+-  s->num_pred_cmds = 0;
-++  s->num_pred_cmds[job] = 0;
-+ }
-+ 
-+ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds;
-++#ifdef INTER_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+     int n,cidx;
-+     AVFrame myref;
-+     AVFrame myref1;
-+     struct MvField mymv;
-+-    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
-++    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
-+         printf("Overflow inter_cmds\n");
-+         exit(-1);
-+     }
-+-    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
-++    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
-+         switch(cmd->cmd) {
-+         case RPI_CMD_LUMA_UNI:
-+             myref.data[0] = cmd->src;
-+@@ -2925,7 +3095,28 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+             break;
-+         }
-+     }
-+-    s->num_mv_cmds = 0;
-++    s->num_mv_cmds[job] = 0;
-++}
-++
-++static void rpi_do_all_passes(HEVCContext *s) // Run every processing pass for the queued commands back to back on the calling thread
-++{
-++#ifdef RPI_INTER_QPU
-++    // Kick off inter prediction on QPUs
-++    rpi_execute_inter_qpu(s);
-++#else
-++    rpi_execute_transform(s);
-++#endif
-++    // Execute the queued inter prediction commands (s->unif_mv_cmds)
-++    rpi_execute_inter_cmds(s);
-++    // Wait for transform completion
-++    vpu_wait(s->vpu_id);
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++#ifdef RPI_INTER_QPU
-++    rpi_inter_clear(s); // Resets the QPU command lists of s->pass0_job; NOTE(review): the calls above index by pass1_job/pass2_job - confirm all three are equal here
-++#endif
-+ }
-+ 
-+ #endif
-+@@ -2933,6 +3124,7 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ #ifdef RPI_INTER_QPU
-+ static void rpi_inter_clear(HEVCContext *s)
-+ {
-++    int job = s->pass0_job;
-+     int i;
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+@@ -2940,51 +3132,50 @@ static void rpi_inter_clear(HEVCContext *s)
-+                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+     for(i=0;i<8;i++) {
-+-        s->u_mvs[i] = s->mvs_base[i];
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = pic_width;
-+-        *s->u_mvs[i]++ = pic_height;
-+-        *s->u_mvs[i]++ = s->frame->linesize[1];
-+-        *s->u_mvs[i]++ = s->frame->linesize[2];
-++        s->u_mvs[job][i] = s->mvs_base[job][i];
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = pic_width;
-++        *s->u_mvs[job][i]++ = pic_height;
-++        *s->u_mvs[job][i]++ = s->frame->linesize[1];
-++        *s->u_mvs[job][i]++ = s->frame->linesize[2];
-+         if (weight_flag) {
-+-            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+-            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-++            *s->u_mvs[job][i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-++            *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
-+         } else {
-+-            *s->u_mvs[i]++ = 1 << 5;
-+-            *s->u_mvs[i]++ = 6;
-++            *s->u_mvs[job][i]++ = 1 << 5;
-++            *s->u_mvs[job][i]++ = 6;
-+         }
-+-        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-++        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+     }
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(i=0;i<12;i++) {
-+-        s->y_mvs[i] = s->y_mvs_base[i];
-+-        *s->y_mvs[i]++ = 0; // y_x
-+-        *s->y_mvs[i]++ = 0; // ref_y_base
-+-        *s->y_mvs[i]++ = 0; // y2_x2
-+-        *s->y_mvs[i]++ = 0; // ref_y2_base
-+-        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+-        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
-+-        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-++        s->y_mvs[job][i] = s->y_mvs_base[job][i];
-++        *s->y_mvs[job][i]++ = 0; // y_x
-++        *s->y_mvs[job][i]++ = 0; // ref_y_base
-++        *s->y_mvs[job][i]++ = 0; // y2_x2
-++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
-++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
-++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
-+         if (weight_flag) {
-+             int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-+             int shift = s->sh.luma_log2_weight_denom + 6;
-+-            *s->y_mvs[i]++ = (offset << 16) + shift;
-++            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-+         } else {
-+             int offset = 1 << 5;
-+             int shift = 6;
-+-            *s->y_mvs[i]++ = (offset << 16) + shift;
-++            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-+         }
-+-        *s->y_mvs[i]++ = 0; // Next kernel
-++        *s->y_mvs[job][i]++ = 0; // Next kernel
-+     }
-+ #endif
-+ }
-+ 
-+-
-+ #ifdef RPI_SIMULATE_QPUS
-+ 
-+ static int32_t clipx(int x,int FRAME_WIDTH)
-+@@ -3258,10 +3449,15 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-++#ifdef LAUNCH_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-+     int i;
-+-    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+ #ifdef RPI_LUMA_QPU
-+-    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
-++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
-+ #endif
-+     if (s->sh.slice_type == I_SLICE) {
-+ #ifdef RPI_MULTI_MAILBOX
-+@@ -3270,22 +3466,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #endif
-+     }
-+     for(k=0;k<8;k++) {
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+-        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-++        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+-    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(k=0;k<12;k++) {
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
-+     }
-+-    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++    s->y_mvs[job][12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ #endif
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+@@ -3295,34 +3491,34 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-++    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+ #else
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+ #endif
-+-    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+ #ifdef RPI_LUMA_QPU
-+                                    qpu_get_fn(QPU_MC_SETUP),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
-+ #else
-+                                    0,
-+                                    0,0,0,0,
-+@@ -3331,17 +3527,17 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #endif
-+                                  );
-+     for(i=0;i<4;i++)
-+-        s->num_coeffs[i] = 0;
-++        s->num_coeffs[job][i] = 0;
-+ #else
-+     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+-      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
-+       );
-+ #endif
-+ 
-+@@ -3398,6 +3594,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         }
-+     }
-+ 
-++#ifdef RPI_WORKER
-++    s->pass0_job = 0;
-++    s->pass1_job = 0;
-++    s->pass2_job = 0;
-++#endif
-+ #ifdef RPI_INTER_QPU
-+     rpi_inter_clear(s);
-+ #endif
-+@@ -3418,46 +3619,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-++
-+ #ifdef RPI
-+         if (s->enable_rpi) {
-+-          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+-          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-            // Transform all blocks
-+-            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-#ifdef RPI_MULTI_MAILBOX
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-            // Perform luma inter prediction
-+-            rpi_execute_inter_cmds(s);
-+-#else
-+-            rpi_execute_transform(s);
-+-            // Perform inter prediction
-+-            rpi_execute_inter_cmds(s);
-+-#ifdef RPI_INTER_QPU
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-#endif
-+-#endif
-+-
-+-            // Wait for transform completion
-+-            vpu_wait(s->vpu_id);
-+-
-+-            // Copy back reconstructed data
-+-            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
-+-            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
-+-            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-++#ifdef RPI_WORKER
-++            if (s->used_for_ref) {
-++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
-++  #ifdef INTER_PASS0
-++              rpi_execute_inter_cmds(s);
-++  #endif
-++  #ifdef LAUNCH_PASS0
-++              rpi_execute_inter_qpu(s);
-++  #endif
-++              // Pass on this job to worker thread
-++              worker_submit_job(s);
-++              // Make sure we have space to prepare the next job
-++              worker_pass0_ready(s);
-+ 
-+-            // Perform intra prediction and residual reconstruction
-+-            rpi_execute_pred_cmds(s);
-+-            // Perform deblocking for CTBs in this row
-+-            rpi_execute_dblk_cmds(s);
-++              // Prepare the next batch of commands
-+ #ifdef RPI_INTER_QPU
-+-            rpi_inter_clear(s);
-++              rpi_inter_clear(s);
-++#endif
-++            } else {
-++              // Non-ref frame so do it all on this thread
-++              rpi_do_all_passes(s);
-++            }
-++#else
-++            rpi_do_all_passes(s);
-+ #endif
-+           }
-+         }
-+ #endif
-++
-++
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+             return more_data;
-+@@ -3474,18 +3671,21 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #ifdef RPI
-+-    if (s->enable_rpi && s->num_dblk_cmds) {
-+-#ifdef RPI_INTER_QPU
-+-        rpi_execute_inter_qpu(s);
-+-#endif
-+-#ifndef RPI_MULTI_MAILBOX
-+-        rpi_execute_transform(s);
-++
-++#ifdef RPI_WORKER
-++    // Wait for the worker to finish all its jobs
-++    if (s->enable_rpi) {
-++        worker_wait(s);
-++        av_assert0(s->pass0_job==s->pass1_job);
-++        av_assert0(s->pass1_job==s->pass2_job);
-++    }
-+ #endif
-+-        rpi_execute_inter_cmds(s);
-+-        vpu_wait(s->vpu_id);
-+-        rpi_execute_pred_cmds(s);
-+-        rpi_execute_dblk_cmds(s);
-++
-++    // Finish off any half-completed rows
-++    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-++        rpi_do_all_passes(s);
-+     }
-++
-+ #endif
-+ 
-+     if (x_ctb + ctb_size >= s->ps.sps->width &&
-+@@ -4153,6 +4353,48 @@ fail:
-+     return AVERROR(ENOMEM);
-+ }
-+ 
-++#ifdef RPI_WORKER
-++static av_cold void hevc_init_worker(HEVCContext *s)
-++{
-++    int err;
-++    pthread_cond_init(&s->worker_cond_head, NULL);
-++    pthread_cond_init(&s->worker_cond_middle, NULL);
-++    pthread_cond_init(&s->worker_cond_tail, NULL);
-++    pthread_mutex_init(&s->worker_mutex, NULL);
-++
-++    s->worker_tail=0;
-++    s->worker_middle=0;
-++    s->worker_head=0;
-++    s->kill_worker=0;
-++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-++    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-++    if (err) {
-++        printf("Failed to create worker thread\n");
-++        exit(-1);
-++    }
-++}
-++
-++static av_cold void hevc_exit_worker(HEVCContext *s)
-++{
-++    void *res;
-++    s->kill_worker=1;
-++    pthread_cond_broadcast(&s->worker_cond_tail);
-++    pthread_cond_broadcast(&s->worker_cond_middle);
-++    pthread_join(s->worker_thread, &res);
-++    pthread_join(s->worker_deblock_thread, &res);
-++
-++    pthread_cond_destroy(&s->worker_cond_head);
-++    pthread_cond_destroy(&s->worker_cond_middle);
-++    pthread_cond_destroy(&s->worker_cond_tail);
-++    pthread_mutex_destroy(&s->worker_mutex);
-++
-++    s->worker_tail=0;
-++    s->worker_middle=0;
-++    s->worker_head=0;
-++    s->kill_worker=0;
-++}
-++#endif
-++
-+ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ {
-+     HEVCContext       *s = avctx->priv_data;
-+@@ -4165,33 +4407,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->cabac_state);
-+ 
-+ #ifdef RPI
-+-    av_freep(&s->unif_mv_cmds);
-+-    av_freep(&s->univ_pred_cmds);
-++
-++#ifdef RPI_WORKER
-++    hevc_exit_worker(s);
-++#endif
-++
-++    for(i=0;i<RPI_MAX_JOBS;i++) {
-++      av_freep(&s->unif_mv_cmds[i]);
-++      av_freep(&s->univ_pred_cmds[i]);
-+ 
-+ #ifdef RPI_INTER_QPU
-+-    if (s->unif_mvs) {
-+-        gpu_free( &s->unif_mvs_ptr );
-+-        s->unif_mvs = 0;
-+-    }
-++      if (s->unif_mvs[i]) {
-++        gpu_free( &s->unif_mvs_ptr[i] );
-++        s->unif_mvs[i] = 0;
-++      }
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-    if (s->y_unif_mvs) {
-+-        gpu_free( &s->y_unif_mvs_ptr );
-+-        s->y_unif_mvs = 0;
-+-    }
-++      if (s->y_unif_mvs[i]) {
-++        gpu_free( &s->y_unif_mvs_ptr[i] );
-++        s->y_unif_mvs[i] = 0;
-++      }
-+ #endif
-+-
-+-#ifdef EARLY_MALLOC
-+-    printf("hevc_decode_free\n");
-+-    if (s->coeffs_buf_arm[0]) {
-+-      gpu_free(&s->coeffs_buf_default);
-+-      s->coeffs_buf_arm[0] = 0;
-+-    }
-+-    if (s->coeffs_buf_arm[2]) {
-+-      gpu_free(&s->coeffs_buf_accelerated);
-+-      s->coeffs_buf_arm[2] = 0;
-+     }
-+-#endif
-++
-+ #endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+@@ -4256,6 +4494,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+     HEVCContext *s = avctx->priv_data;
-+     int i;
-++    int job;
-+ 
-+     s->avctx = avctx;
-+ 
-+@@ -4266,12 +4505,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->sList[0] = s;
-+ 
-+ #ifdef RPI
-+-    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+-    if (!s->unif_mv_cmds)
-+-        goto fail;
-+-    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+-    if (!s->univ_pred_cmds)
-+-        goto fail;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-++        if (!s->unif_mv_cmds[job])
-++            goto fail;
-++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-++        if (!s->univ_pred_cmds[job])
-++            goto fail;
-++    }
-+ 
-+ #ifdef RPI_INTER_QPU
-+     // We divide the image into blocks 256 wide and 64 high
-+@@ -4282,18 +4523,20 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     {
-+         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-++		for(job=0;job<RPI_MAX_JOBS;job++) {
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+ #else
-+-        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+ #endif
-+-        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
-+ 
-+-        // Set up initial locations for uniform streams
-+-        p = s->unif_mvs;
-+-        for(i = 0; i < 8; i++) {
-+-            s->mvs_base[i] = p;
-++          // Set up initial locations for uniform streams
-++          p = s->unif_mvs[job];
-++          for(i = 0; i < 8; i++) {
-++            s->mvs_base[job][i] = p;
-+             p += uv_commands_per_qpu;
-++          }
-+         }
-+         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+         s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+@@ -4302,61 +4545,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     }
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-++    for(job=0;job<RPI_MAX_JOBS;job++)
-+     {
-+         int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-+         uint32_t *p;
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+ #else
-+-        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+ #endif
-+-        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
-+ 
-+         // Set up initial locations for uniform streams
-+-        p = s->y_unif_mvs;
-++        p = s->y_unif_mvs[job];
-+         for(i = 0; i < 12; i++) {
-+-            s->y_mvs_base[i] = p;
-++            s->y_mvs_base[job][i] = p;
-+             p += y_commands_per_qpu;
-+         }
-+-        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+-        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+-
-+     }
-++    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-++    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+ #endif
-+     //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+-#ifdef EARLY_MALLOC
-+-    {
-+-        int coeffs_in_ctb = 64*64;
-+-        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+-        s->coeffs_buf_arm[0] = 0;
-+-        s->coeffs_buf_arm[2] = 0;
-+-        printf("Allocated %d\n",coefs_per_row);
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-        if (!s->coeffs_buf_arm[0])
-+-            goto fail;
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-        if (!s->coeffs_buf_arm[2])
-+-            goto fail;
-+-        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-        printf("Done\n");
-+-#ifdef RPI_PRECLEAR
-+-        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+-        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+-        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+-#endif
-+-    }
-+-#endif
-+-
-+     s->enable_rpi = 0;
-+ 
-++#ifdef RPI_WORKER
-++    hevc_init_worker(s);
-++#endif
-++
-+ #endif
-+ 
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index aa4d218..8d72344 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -50,6 +50,12 @@
-+     // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-+     #define RPI_LUMA_QPU
-+   #endif
-++
-++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
-++  #define RPI_MAX_JOBS 2
-++  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-++  #define RPI_WORKER
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -832,6 +838,13 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-++#ifdef RPI_WORKER
-++typedef struct HEVCLocalContextIntra {
-++    TransformUnit tu;
-++    NeighbourAvailable na;
-++} HEVCLocalContextIntra;
-++#endif
-++
-+ #ifdef RPI
-+ 
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+@@ -900,7 +913,7 @@ typedef struct HEVCPredCmd {
-+ 
-+ typedef struct HEVCContext {
-+ #ifdef RPI
-+-    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+ #endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+@@ -909,7 +922,9 @@ typedef struct HEVCContext {
-+ 
-+     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
-+     HEVCLocalContext    *HEVClc;
-+-
-++#ifdef RPI_WORKER
-++    HEVCLocalContextIntra HEVClcIntra;
-++#endif
-+     uint8_t             threads_type;
-+     uint8_t             threads_number;
-+ 
-+@@ -920,43 +935,60 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;
-+-    HEVCPredCmd *univ_pred_cmds;
-++    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
-++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
-+     int buf_width;
-+-    GPU_MEM_PTR_T coeffs_buf_default;
-+-    GPU_MEM_PTR_T coeffs_buf_accelerated;
-+-    int16_t *coeffs_buf_arm[4];
-+-    unsigned int coeffs_buf_vc[4];
-+-    int num_coeffs[4];
-+-    int num_xfm_cmds;
-+-    int num_mv_cmds;
-+-    int num_pred_cmds;
-+-    int num_dblk_cmds;
-++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
-++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
-++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
-++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
-++    int num_coeffs[RPI_MAX_JOBS][4];
-++    int num_xfm_cmds[RPI_MAX_JOBS];
-++    int num_mv_cmds[RPI_MAX_JOBS];
-++    int num_pred_cmds[RPI_MAX_JOBS];
-++    int num_dblk_cmds[RPI_MAX_JOBS];
-+     int vpu_id;
-+     //GPU_MEM_PTR_T dummy;
-++    int pass0_job; // Pass0 does coefficient decode
-++    int pass1_job; // Pass1 does pixel processing
-++    int pass2_job; // Pass2 does reconstruction and deblocking
-+ #ifdef RPI_INTER_QPU
-+-    GPU_MEM_PTR_T unif_mvs_ptr;
-+-    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+ 
-+     // _base pointers are to the start of the row
-+-    uint32_t *mvs_base[8];
-++    uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+     // these pointers are to the next free space
-+-    uint32_t *u_mvs[8];
-++    uint32_t *u_mvs[RPI_MAX_JOBS][8];
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-+     uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-    GPU_MEM_PTR_T y_unif_mvs_ptr;
-+-    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
-+-    uint32_t *y_mvs_base[12];
-+-    uint32_t *y_mvs[12];
-++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
-++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-++    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-++    uint32_t *y_mvs[RPI_MAX_JOBS][12];
-+     // Function pointers
-+     uint32_t mc_filter;
-+     uint32_t mc_filter_b;
-+ #endif
-+ 
-++#ifdef RPI_WORKER
-++    pthread_t worker_thread;
-++    pthread_t worker_deblock_thread;
-++    pthread_cond_t worker_cond_head;
-++    pthread_cond_t worker_cond_tail;
-++    pthread_cond_t worker_cond_middle;
-++    pthread_mutex_t worker_mutex;
-++
-++    int worker_tail; // Contains the number of posted jobs
-++    int worker_head; // Contains the number of completed jobs
-++    int worker_middle; // Contains the number of completed jobs
-++    int kill_worker; // set to 1 to terminate the worker
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index ca76cb0..b9f773b 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1051,11 +1051,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+         if (use_vpu) {
-+-            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
-+-            s->num_coeffs[log2_trafo_size - 2] += n;
-++            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+         } else {
-+-            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
-+-            s->num_coeffs[0] += n;
-++            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-++            s->num_coeffs[s->pass0_job][0] += n;
-+         }
-+     }
-+     // We now do the memset after transform_add while we know the data is cached.
-+@@ -1508,7 +1508,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+             }
-+         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+-            s->hevcdsp.idct_4x4_luma(coeffs);
-++           s->hevcdsp.idct_4x4_luma(coeffs);
-+         } else {
-+ #ifdef RPI
-+             if (!use_vpu) {
-+@@ -1553,7 +1553,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     }
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+-        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+         cmd->buf = coeffs;
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 71c6d52..344e021 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -71,8 +71,11 @@ do {                                  \
-+                 AV_WN4P(&ptr[i], a);                                           \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+-
-++#ifdef RPI_WORKER
-++    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++#else
-+     HEVCLocalContext *lc = s->HEVClc;
-++#endif
-+     int i;
-+     int hshift = s->ps.sps->hshift[c_idx];
-+     int vshift = s->ps.sps->vshift[c_idx];
-+-- 
-+2.5.0
-+
-+
-+From e3604dee43bae2083ecea8b578da9878a7877f1f Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 13:43:48 +0100
-+Subject: [PATCH 54/68] Avoid lockup bug with RPI_WORKER enabled
-+
-+---
-+ libavcodec/hevc.c       | 22 +++++++++++-----------
-+ libavcodec/hevc_cabac.c |  1 -
-+ 2 files changed, 11 insertions(+), 12 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 594340a..323d5f9 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -131,11 +131,11 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+ static void worker_submit_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_tail++;
-+   s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+   pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+@@ -143,11 +143,11 @@ static void worker_submit_job(HEVCContext *s)
-+ static void worker_complete_middle_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_middle++;
-+   s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+@@ -155,11 +155,11 @@ static void worker_complete_middle_job(HEVCContext *s)
-+ static void worker_complete_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_head++; // This is the only place that can change head so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_head++;
-+   s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index b9f773b..16e7ac3 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1497,7 +1497,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                 for (i = 0; i < 8; i++)
-+                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-+             }
-+-
-+             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
-+ 
-+             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+-- 
-+2.5.0
-+
-+
-+From 38c4114cd6d6335dde5a4ba86f0f6e1a1529ad6a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 15:37:19 +0100
-+Subject: [PATCH 55/68] Added code to flush buffers at start of frame
-+
-+---
-+ libavcodec/hevc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 72 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 323d5f9..aa72f97 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,6 +41,7 @@
-+ 
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-++  #include "rpi_user_vcsm.h"
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ 
-+@@ -3495,6 +3496,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #else
-+     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+ #endif
-++
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+@@ -3545,6 +3547,71 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ }
-+ #endif
-+ 
-++#ifdef RPI
-++
-++static void flush_buffer(AVBufferRef *bref) {
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++    gpu_cache_flush(p);
-++}
-++
-++static void flush_frame(HEVCContext *s,AVFrame *frame)
-++{
-++#if 1
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    int n = s->ps.sps->height;
-++    int curr_y = 0;
-++    int curr_uv = 0;
-++    int n_uv = n >> s->ps.sps->vshift[1];
-++    int sz,base;
-++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++    base = s->frame->linesize[1] * curr_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[2]);
-++    iocache.s[1].handle = p->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[0]);
-++    sz = s->frame->linesize[0] * (n-curr_y);
-++    base = s->frame->linesize[0] * curr_y;
-++    iocache.s[2].handle = p->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].size  = sz;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    flush_buffer(frame->buf[0]);
-++    flush_buffer(frame->buf[1]);
-++    flush_buffer(frame->buf[2]);
-++#endif
-++}
-++
-++static void flush_all(HEVCContext *s)
-++{
-++#if 0
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 4; // Flush all
-++    iocache.s[0].addr = p->arm;
-++    iocache.s[0].size  = 4096;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++  int i,k;
-++  for(i=0;i<2;i++) {
-++    for (k = 0; k < s->sh.nb_refs[i]; k++) {
-++      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
-++    }
-++  }
-++  flush_frame(s,s->frame);
-++#endif
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -3579,8 +3646,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         printf("Weighted B slice\n");
-+     }
-+ 
-++    // Now flush all reference frames and our destination frame to get everything ready for decode
-++    flush_all(s);
-+ #endif
-+ 
-++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-++
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+         return AVERROR_INVALIDDATA;
-+@@ -3651,6 +3722,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             rpi_do_all_passes(s);
-+ #endif
-+           }
-++
-+         }
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b279851bb85b1fe15355603dcd53c3f1b6f06724 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 16:42:24 +0100
-+Subject: [PATCH 56/68] Reduce the amount that needs to be flushed
-+
-+---
-+ libavcodec/hevc.c | 35 +++++++++++------------------------
-+ 1 file changed, 11 insertions(+), 24 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index aa72f97..a2ba177 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3556,7 +3556,7 @@ static void flush_buffer(AVBufferRef *bref) {
-+ 
-+ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+-#if 1
-++#ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     int n = s->ps.sps->height;
-+     int curr_y = 0;
-+@@ -3590,26 +3590,6 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-+-static void flush_all(HEVCContext *s)
-+-{
-+-#if 0
-+-    struct vcsm_user_clean_invalid_s iocache = {};
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
-+-    iocache.s[0].handle = p->vcsm_handle;
-+-    iocache.s[0].cmd = 4; // Flush all
-+-    iocache.s[0].addr = p->arm;
-+-    iocache.s[0].size  = 4096;
-+-    vcsm_clean_invalid( &iocache );
-+-#else
-+-  int i,k;
-+-  for(i=0;i<2;i++) {
-+-    for (k = 0; k < s->sh.nb_refs[i]; k++) {
-+-      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
-+-    }
-+-  }
-+-  flush_frame(s,s->frame);
-+-#endif
-+-}
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -3645,9 +3625,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+     }
-+-
-+-    // Now flush all reference frames and our destination frame to get everything ready for decode
-+-    flush_all(s);
-+ #endif
-+ 
-+     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+@@ -4053,6 +4030,11 @@ static int hevc_frame_start(HEVCContext *s)
-+     if (!s->avctx->hwaccel)
-+         ff_thread_finish_setup(s->avctx);
-+ 
-++#ifdef RPI_INTER_QPU
-++    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
-++    flush_frame(s,s->frame);
-++#endif
-++
-+     return 0;
-+ 
-+ fail:
-+@@ -4254,6 +4236,11 @@ fail:
-+         ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-++    } else if (s->ref) {
-++#ifdef RPI_INTER_QPU
-++      // When running single threaded we need to flush the whole frame
-++      flush_frame(s,s->frame);
-++#endif
-+     }
-+     return ret;
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 7475c16d1b6b4ce94bb65f42bf3ae26969d4abf4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 4 Jun 2015 07:59:28 +0100
-+Subject: [PATCH 57/68] Corrected support for disabled rpi when using
-+ RPI_WORKER
-+
-+---
-+ libavcodec/hevc.h              | 18 ++++++++++--------
-+ libavcodec/hevcpred_template.c |  2 +-
-+ 2 files changed, 11 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8d72344..83b0e58 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -795,7 +795,17 @@ typedef struct HEVCPacket {
-+     int nals_allocated;
-+ } HEVCPacket;
-+ 
-++#ifdef RPI_WORKER
-++typedef struct HEVCLocalContextIntra {
-++    TransformUnit tu;
-++    NeighbourAvailable na;
-++} HEVCLocalContextIntra;
-++#endif
-++
-+ typedef struct HEVCLocalContext {
-++    TransformUnit tu;
-++    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
-++
-+     uint8_t cabac_state[HEVC_CONTEXTS];
-+ 
-+     uint8_t stat_coeff[4];
-+@@ -810,7 +820,6 @@ typedef struct HEVCLocalContext {
-+ 
-+     int qPy_pred;
-+ 
-+-    TransformUnit tu;
-+ 
-+     uint8_t ctb_left_flag;
-+     uint8_t ctb_up_flag;
-+@@ -827,7 +836,6 @@ typedef struct HEVCLocalContext {
-+     int ct_depth;
-+     CodingUnit cu;
-+     PredictionUnit pu;
-+-    NeighbourAvailable na;
-+ 
-+ #define BOUNDARY_LEFT_SLICE     (1 << 0)
-+ #define BOUNDARY_LEFT_TILE      (1 << 1)
-+@@ -838,12 +846,6 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-+-#ifdef RPI_WORKER
-+-typedef struct HEVCLocalContextIntra {
-+-    TransformUnit tu;
-+-    NeighbourAvailable na;
-+-} HEVCLocalContextIntra;
-+-#endif
-+ 
-+ #ifdef RPI
-+ 
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 344e021..325b60e 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -72,7 +72,7 @@ do {                                  \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+ #ifdef RPI_WORKER
-+-    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-+ #else
-+     HEVCLocalContext *lc = s->HEVClc;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 665b1e12a132f7ea798472d46200ad930abe2a82 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 4 Jun 2015 11:52:55 +0100
-+Subject: [PATCH 58/68] Draft support for tiles
-+
-+---
-+ libavcodec/hevc.c              | 140 +++++++++++++++++++++++------------------
-+ libavcodec/hevc.h              |  22 +++++--
-+ libavcodec/hevc_filter.c       |   2 +-
-+ libavcodec/hevcpred_template.c |   2 +-
-+ 4 files changed, 100 insertions(+), 66 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index a2ba177..f3f5fdb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -61,10 +61,10 @@
-+ 
-+   static void rpi_execute_dblk_cmds(HEVCContext *s);
-+   static void rpi_execute_transform(HEVCContext *s);
-+-  static void rpi_execute_inter_qpu(HEVCContext *s);
-++  static void rpi_launch_vpu_qpu(HEVCContext *s);
-+   static void rpi_execute_pred_cmds(HEVCContext *s);
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+-  static void rpi_inter_clear(HEVCContext *s);
-++  static void rpi_begin(HEVCContext *s);
-+ 
-+   // Define INTER_PASS0 to do inter prediction in first pass
-+   //#define INTER_PASS0
-+@@ -88,16 +88,18 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
-++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
-++// For each block of 64*64 the smallest block size is 8x4
-++// We also need an extra command for the setup information
-++
-+ #define RPI_CHROMA_COMMAND_WORDS 12
-+-#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-++#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-+-// Split image of 2048 into parts 64 wide
-+-// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-+-// For each block of 64*64 the smallest block size is 8x4
-+ #define RPI_LUMA_COMMAND_WORDS 9
-+-#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-++#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -214,7 +216,7 @@ static void *worker_start(void *arg)
-+     LOG_ENTER
-+     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+ #ifndef LAUNCH_PASS0
-+-    rpi_execute_inter_qpu(s);
-++    rpi_launch_vpu_qpu(s);
-+ #endif
-+ #ifndef INTER_PASS0
-+     // Perform inter prediction
-+@@ -320,9 +322,14 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ 
-+ #ifdef RPI
-+     av_assert0(sps);
-+-    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+-    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-++    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-++    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-++    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-++    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-+     int job;
-++    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-++    s->ctu_per_y_chan = s->max_ctu_count / 12;
-++    s->ctu_per_uv_chan = s->max_ctu_count / 8;
-+     for(job=0;job<RPI_MAX_JOBS;job++) {
-+       printf("Allocated %d\n",coefs_per_row);
-+       for(job=0;job<RPI_MAX_JOBS;job++) {
-+@@ -2173,10 +2180,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+             int x1 = x0 + (mv->x >> 2);
-+             int y1 = y0 + (mv->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2196,7 +2202,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+         {
-+@@ -2220,12 +2226,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+-                int chan = x0>>8;
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2249,7 +2253,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2276,10 +2280,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+             int x1 = x0 + (mv->x >> 2);
-+             int y1 = y0 + (mv->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2299,7 +2302,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+ 
-+@@ -2324,12 +2327,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+-                int chan = x0>>8;
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2354,7 +2355,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2387,8 +2388,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int y1 = y0 + (mv->y >> 2);
-+             int x2 = x0 + (mv2->x >> 2);
-+             int y2 = y0 + (mv2->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                   int bw = nPbW-start_x;
-+@@ -2404,7 +2404,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+         {
-+@@ -2435,9 +2435,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int x2_c = x0_c + (mv2->x >> (2 + hshift));
-+                 int y2_c = y0_c + (mv2->y >> (2 + hshift));
-+ 
-+-                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2466,7 +2465,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -3101,12 +3100,8 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ 
-+ static void rpi_do_all_passes(HEVCContext *s)
-+ {
-+-#ifdef RPI_INTER_QPU
-+-    // Kick off inter prediction on QPUs
-+-    rpi_execute_inter_qpu(s);
-+-#else
-+-    rpi_execute_transform(s);
-+-#endif
-++    // Kick off QPUs and VPUs
-++    rpi_launch_vpu_qpu(s);
-+     // Perform luma inter prediction
-+     rpi_execute_inter_cmds(s);
-+     // Wait for transform completion
-+@@ -3115,18 +3110,18 @@ static void rpi_do_all_passes(HEVCContext *s)
-+     rpi_execute_pred_cmds(s);
-+     // Perform deblocking for CTBs in this row
-+     rpi_execute_dblk_cmds(s);
-+-#ifdef RPI_INTER_QPU
-+-    rpi_inter_clear(s);
-+-#endif
-++    // Prepare next batch
-++    rpi_begin(s);
-+ }
-+ 
-+ #endif
-+ 
-+-#ifdef RPI_INTER_QPU
-+-static void rpi_inter_clear(HEVCContext *s)
-++#ifdef RPI
-++static void rpi_begin(HEVCContext *s)
-+ {
-+     int job = s->pass0_job;
-+     int i;
-++#ifdef RPI_INTER_QPU
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+     int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+@@ -3152,6 +3147,8 @@ static void rpi_inter_clear(HEVCContext *s)
-+         }
-+         *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+     }
-++    s->curr_u_mvs = s->u_mvs[job][0];
-++#endif
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(i=0;i<12;i++) {
-+@@ -3174,8 +3171,11 @@ static void rpi_inter_clear(HEVCContext *s)
-+         }
-+         *s->y_mvs[job][i]++ = 0; // Next kernel
-+     }
-++    s->curr_y_mvs = s->y_mvs[job][0];
-+ #endif
-++    s->ctu_count = 0;
-+ }
-++#endif
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+ 
-+@@ -3446,8 +3446,9 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ 
-+ #endif
-+ 
-++#ifdef RPI_INTER_QPU
-+ 
-+-static void rpi_execute_inter_qpu(HEVCContext *s)
-++static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ {
-+     int k;
-+ #ifdef LAUNCH_PASS0
-+@@ -3545,6 +3546,15 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ 
-+ }
-++#else
-++
-++#ifdef RPI
-++static void rpi_launch_vpu_qpu(HEVCContext *s)
-++{
-++  rpi_execute_transform(s);
-++}
-++#endif
-++
-+ #endif
-+ 
-+ #ifdef RPI
-+@@ -3604,29 +3614,20 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+ #ifdef RPI_INTER_QPU
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+-                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+                     && !s->ps.pps->cross_component_prediction_enabled_flag
-+-                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-+                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-+ #else
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+-                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+-                    && !s->ps.pps->cross_component_prediction_enabled_flag
-+-                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
-+ #endif
-+ 
-+     if (!s->enable_rpi) {
-+       if (s->ps.pps->cross_component_prediction_enabled_flag)
-+         printf("Cross component\n");
-+-      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-+-        printf("Tiles\n");
-+-      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+-        printf("Weighted P slice\n");
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+     }
-+ #endif
-+-
-+     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+@@ -3647,8 +3648,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     s->pass1_job = 0;
-+     s->pass2_job = 0;
-+ #endif
-+-#ifdef RPI_INTER_QPU
-+-    rpi_inter_clear(s);
-++#ifdef RPI
-++    rpi_begin(s);
-+ #endif
-+ 
-+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-+@@ -3666,13 +3667,34 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-++#ifdef RPI_INTER_QPU
-++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
-++#endif
-++#ifdef RPI_LUMA_QPU
-++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
-++#endif
-++
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ 
-++#ifdef RPI_INTER_QPU
-++        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-++#endif
-++#ifdef RPI_LUMA_QPU
-++        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-++#endif
-++
-+ #ifdef RPI
-+         if (s->enable_rpi) {
-++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
-++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
-++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
-++          //av_assert0(s->pass0_job>=0);
-+           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-+           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-+-          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-++          s->ctu_count++;
-++          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
-++
-++          if ( s->ctu_count >= s->max_ctu_count ) {
-+ #ifdef RPI_WORKER
-+             if (s->used_for_ref) {
-+               // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+@@ -3680,7 +3702,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+               rpi_execute_inter_cmds(s);
-+   #endif
-+   #ifdef LAUNCH_PASS0
-+-              rpi_execute_inter_qpu(s);
-++              rpi_launch_vpu_qpu(s);
-+   #endif
-+               // Pass on this job to worker thread
-+               worker_submit_job(s);
-+@@ -3688,9 +3710,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+               worker_pass0_ready(s);
-+ 
-+               // Prepare the next batch of commands
-+-#ifdef RPI_INTER_QPU
-+-              rpi_inter_clear(s);
-+-#endif
-++              rpi_begin(s);
-+             } else {
-+               // Non-ref frame so do it all on this thread
-+               rpi_do_all_passes(s);
-+@@ -3731,7 +3751,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #endif
-+ 
-+     // Finish off any half-completed rows
-+-    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-++    if (s->enable_rpi && s->ctu_count) {
-+         rpi_do_all_passes(s);
-+     }
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 83b0e58..c62540d 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -849,8 +849,15 @@ typedef struct HEVCLocalContext {
-+ 
-+ #ifdef RPI
-+ 
-++// The processing is done in chunks
-++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
-++// This is a distance of 1536 pixels across the screen
-++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
-++// but allocate more memory and increase the latency before data in the next frame can be processed
-++#define RPI_NUM_CHUNKS 1
-++
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+-#define RPI_MAX_WIDTH 2048
-++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
-+ 
-+ // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+ #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+@@ -914,9 +921,6 @@ typedef struct HEVCPredCmd {
-+ #endif
-+ 
-+ typedef struct HEVCContext {
-+-#ifdef RPI
-+-    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+-#endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+ 
-+@@ -954,6 +958,10 @@ typedef struct HEVCContext {
-+     int pass0_job; // Pass0 does coefficient decode
-+     int pass1_job; // Pass1 does pixel processing
-+     int pass2_job; // Pass2 does reconstruction and deblocking
-++    int ctu_count; // Number of CTUs done in pass0 so far
-++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
-++    int ctu_per_y_chan; // Number of CTUs per luma QPU
-++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
-+ #ifdef RPI_INTER_QPU
-+     GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-+     uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+@@ -962,6 +970,7 @@ typedef struct HEVCContext {
-+     uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+     // these pointers are to the next free space
-+     uint32_t *u_mvs[RPI_MAX_JOBS][8];
-++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-+     uint32_t mc_filter_uv_b0;
-+@@ -972,6 +981,7 @@ typedef struct HEVCContext {
-+     uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+     uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-+     uint32_t *y_mvs[RPI_MAX_JOBS][12];
-++    uint32_t *curr_y_mvs; // Current uniform stream for luma
-+     // Function pointers
-+     uint32_t mc_filter;
-+     uint32_t mc_filter_b;
-+@@ -1099,6 +1109,10 @@ typedef struct HEVCContext {
-+     int sei_hflip, sei_vflip;
-+ 
-+     int picture_struct;
-++
-++#ifdef RPI
-++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-++#endif
-+ } HEVCContext;
-+ 
-+ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index b286bbf..1f04790 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -891,7 +891,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-+         if (curr_uv < 0) curr_uv = 0;
-+-        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-++        if (n_uv<=curr_uv) { return; }
-+         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+         base = s->frame->linesize[1] * curr_uv;
-+         GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 325b60e..28d2653 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -72,7 +72,7 @@ do {                                  \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+ #ifdef RPI_WORKER
-+-    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-++    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-+ #else
-+     HEVCLocalContext *lc = s->HEVClc;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From e8bf19f86fefd76f1f48d7b96bb47ec23c2802fc Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 4 Jun 2015 15:48:10 +0100
-+Subject: [PATCH 59/68] Move deblocker into second pass
-+
-+---
-+ libavcodec/hevc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
-+ 1 file changed, 65 insertions(+), 14 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index f3f5fdb..bd59f02 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -65,6 +65,8 @@
-+   static void rpi_execute_pred_cmds(HEVCContext *s);
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+   static void rpi_begin(HEVCContext *s);
-++  static void flush_frame(HEVCContext *s,AVFrame *frame);
-++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+   // Define INTER_PASS0 to do inter prediction in first pass
-+   //#define INTER_PASS0
-+@@ -225,6 +227,11 @@ static void *worker_start(void *arg)
-+     // Wait for transform completion
-+     vpu_wait(s->vpu_id);
-+ 
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++
-+     worker_complete_middle_job(s);
-+     LOG_EXIT
-+   }
-+@@ -246,10 +253,6 @@ static void *worker_deblock_start(void *arg)
-+       break;
-+     }
-+     LOG_ENTER
-+-    // Perform intra prediction and residual reconstruction
-+-    rpi_execute_pred_cmds(s);
-+-    // Perform deblocking for CTBs in this row
-+-    rpi_execute_dblk_cmds(s);
-+ 
-+     worker_complete_job(s);
-+     LOG_EXIT
-+@@ -2970,7 +2973,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ {
-+     int n;
-+-    int job = s->pass2_job;
-++    int job = s->pass1_job;
-+     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+     int (*p)[2] = s->dblk_cmds[job];
-+     for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-+@@ -3008,7 +3011,7 @@ static void rpi_execute_transform(HEVCContext *s)
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+-  int job = s->pass2_job;
-++  int job = s->pass1_job;
-+   HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-+ #ifdef RPI_WORKER
-+   HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+@@ -3493,11 +3496,10 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+ #else
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
-+ #endif
-+-
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+@@ -3600,6 +3602,60 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    int n = s->ps.sps->height;
-++    int curr_y = 0;
-++    int curr_uv = 0;
-++    int n_uv = n >> s->ps.sps->vshift[1];
-++    int sz,base;
-++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++    base = s->frame->linesize[1] * curr_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[2]);
-++    iocache.s[1].handle = p->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[0]);
-++    sz = s->frame->linesize[0] * (n-curr_y);
-++    base = s->frame->linesize[0] * curr_y;
-++    iocache.s[2].handle = p->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].size  = sz;
-++
-++    iocache.s[3].handle = p0->vcsm_handle;
-++    iocache.s[3].cmd = 3; // clean+invalidate
-++    iocache.s[3].addr = (int) p0->arm;
-++    iocache.s[3].size  = p0->numbytes;
-++    if (p1) {
-++      iocache.s[4].handle = p1->vcsm_handle;
-++      iocache.s[4].cmd = 3; // clean+invalidate
-++      iocache.s[4].addr = (int) p1->arm;
-++      iocache.s[4].size  = p1->numbytes;
-++    }
-++    if (p2) {
-++      iocache.s[5].handle = p2->vcsm_handle;
-++      iocache.s[5].cmd = 3; // clean+invalidate
-++      iocache.s[5].addr = (int) p2->arm;
-++      iocache.s[5].size  = p2->numbytes;
-++    }
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    flush_buffer(frame->buf[0]);
-++    flush_buffer(frame->buf[1]);
-++    flush_buffer(frame->buf[2]);
-++    gpu_cache_flush3(p0, p1, p2);
-++#endif
-++}
-++
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -4050,11 +4106,6 @@ static int hevc_frame_start(HEVCContext *s)
-+     if (!s->avctx->hwaccel)
-+         ff_thread_finish_setup(s->avctx);
-+ 
-+-#ifdef RPI_INTER_QPU
-+-    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
-+-    flush_frame(s,s->frame);
-+-#endif
-+-
-+     return 0;
-+ 
-+ fail:
-+-- 
-+2.5.0
-+
-+
-+From bd42b24c8f7e1f0d2bcfa476d2e1aea20aa3723e Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 4 Jun 2015 16:10:23 +0100
-+Subject: [PATCH 60/68] Change order of ctu accesses to improve qpu performance
-+
-+---
-+ libavcodec/hevc.c | 8 ++++----
-+ 1 file changed, 4 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index bd59f02..ff93f6c 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3724,19 +3724,19 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+ #ifdef RPI_INTER_QPU
-+-        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
-++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
-++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
-+ #endif
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ 
-+ #ifdef RPI_INTER_QPU
-+-        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-++        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-++        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
-+ #endif
-+ 
-+ #ifdef RPI
-+-- 
-+2.5.0
-+
-+
-+From 3ba78b5fe86fccfb132068603ad1db87ce44ab6c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 09:36:59 +0100
-+Subject: [PATCH 61/68] Removed deblocker thread
-+
-+---
-+ libavcodec/hevc.c | 77 +++----------------------------------------------------
-+ libavcodec/hevc.h |  4 ---
-+ 2 files changed, 4 insertions(+), 77 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index ff93f6c..43f7ce5 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -68,11 +68,6 @@
-+   static void flush_frame(HEVCContext *s,AVFrame *frame);
-+   static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+-  // Define INTER_PASS0 to do inter prediction in first pass
-+-  //#define INTER_PASS0
-+-  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
-+-  //#define LAUNCH_PASS0
-+-
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -145,24 +140,12 @@ static void worker_submit_job(HEVCContext *s)
-+ }
-+ 
-+ // Call this to say we have completed pass1
-+-static void worker_complete_middle_job(HEVCContext *s)
-+-{
-+-  LOG_ENTER
-+-  pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_middle++;
-+-  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-+-  pthread_mutex_unlock(&s->worker_mutex);
-+-  LOG_EXIT
-+-}
-+-
-+-// Call this to say we have completed pass2
-+ static void worker_complete_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+   pthread_mutex_lock(&s->worker_mutex);
-+   s->worker_head++;
-+-  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+   pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-+   pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+@@ -206,7 +189,7 @@ static void *worker_start(void *arg)
-+   while(1) {
-+     pthread_mutex_lock(&s->worker_mutex);
-+ 
-+-    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
-+     {
-+       pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-+     }
-+@@ -217,13 +200,9 @@ static void *worker_start(void *arg)
-+     }
-+     LOG_ENTER
-+     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-#ifndef LAUNCH_PASS0
-+     rpi_launch_vpu_qpu(s);
-+-#endif
-+-#ifndef INTER_PASS0
-+     // Perform inter prediction
-+     rpi_execute_inter_cmds(s);
-+-#endif
-+     // Wait for transform completion
-+     vpu_wait(s->vpu_id);
-+ 
-+@@ -232,28 +211,6 @@ static void *worker_start(void *arg)
-+     // Perform deblocking for CTBs in this row
-+     rpi_execute_dblk_cmds(s);
-+ 
-+-    worker_complete_middle_job(s);
-+-    LOG_EXIT
-+-  }
-+-  return NULL;
-+-}
-+-
-+-static void *worker_deblock_start(void *arg)
-+-{
-+-  HEVCContext *s = (HEVCContext *)arg;
-+-  while(1) {
-+-    pthread_mutex_lock(&s->worker_mutex);
-+-    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
-+-    {
-+-      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
-+-    }
-+-    pthread_mutex_unlock(&s->worker_mutex);
-+-
-+-    if (s->kill_worker) {
-+-      break;
-+-    }
-+-    LOG_ENTER
-+-
-+     worker_complete_job(s);
-+     LOG_EXIT
-+   }
-+@@ -2985,11 +2942,7 @@ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+-#ifdef LAUNCH_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     //int j;
-+     //int16_t *coeffs = s->coeffs_buf_arm[i];
-+     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+@@ -3044,11 +2997,7 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ 
-+ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ {
-+-#ifdef INTER_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+     int n,cidx;
-+     AVFrame myref;
-+@@ -3454,11 +3403,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ {
-+     int k;
-+-#ifdef LAUNCH_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+ #ifdef RPI_LUMA_QPU
-+@@ -3561,10 +3506,12 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI
-+ 
-++#ifndef RPI_FAST_CACHEFLUSH
-+ static void flush_buffer(AVBufferRef *bref) {
-+     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+     gpu_cache_flush(p);
-+ }
-++#endif
-+ 
-+ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+@@ -3702,7 +3649,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_WORKER
-+     s->pass0_job = 0;
-+     s->pass1_job = 0;
-+-    s->pass2_job = 0;
-+ #endif
-+ #ifdef RPI
-+     rpi_begin(s);
-+@@ -3754,12 +3700,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_WORKER
-+             if (s->used_for_ref) {
-+               // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+-  #ifdef INTER_PASS0
-+-              rpi_execute_inter_cmds(s);
-+-  #endif
-+-  #ifdef LAUNCH_PASS0
-+-              rpi_launch_vpu_qpu(s);
-+-  #endif
-+               // Pass on this job to worker thread
-+               worker_submit_job(s);
-+               // Make sure we have space to prepare the next job
-+@@ -3801,8 +3741,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     // Wait for the worker to finish all its jobs
-+     if (s->enable_rpi) {
-+         worker_wait(s);
-+-        av_assert0(s->pass0_job==s->pass1_job);
-+-        av_assert0(s->pass1_job==s->pass2_job);
-+     }
-+ #endif
-+ 
-+@@ -4488,16 +4426,13 @@ static av_cold void hevc_init_worker(HEVCContext *s)
-+ {
-+     int err;
-+     pthread_cond_init(&s->worker_cond_head, NULL);
-+-    pthread_cond_init(&s->worker_cond_middle, NULL);
-+     pthread_cond_init(&s->worker_cond_tail, NULL);
-+     pthread_mutex_init(&s->worker_mutex, NULL);
-+ 
-+     s->worker_tail=0;
-+-    s->worker_middle=0;
-+     s->worker_head=0;
-+     s->kill_worker=0;
-+     err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-+-    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-+     if (err) {
-+         printf("Failed to create worker thread\n");
-+         exit(-1);
-+@@ -4509,17 +4444,13 @@ static av_cold void hevc_exit_worker(HEVCContext *s)
-+     void *res;
-+     s->kill_worker=1;
-+     pthread_cond_broadcast(&s->worker_cond_tail);
-+-    pthread_cond_broadcast(&s->worker_cond_middle);
-+     pthread_join(s->worker_thread, &res);
-+-    pthread_join(s->worker_deblock_thread, &res);
-+ 
-+     pthread_cond_destroy(&s->worker_cond_head);
-+-    pthread_cond_destroy(&s->worker_cond_middle);
-+     pthread_cond_destroy(&s->worker_cond_tail);
-+     pthread_mutex_destroy(&s->worker_mutex);
-+ 
-+     s->worker_tail=0;
-+-    s->worker_middle=0;
-+     s->worker_head=0;
-+     s->kill_worker=0;
-+ }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index c62540d..6c0d0b6 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -957,7 +957,6 @@ typedef struct HEVCContext {
-+     //GPU_MEM_PTR_T dummy;
-+     int pass0_job; // Pass0 does coefficient decode
-+     int pass1_job; // Pass1 does pixel processing
-+-    int pass2_job; // Pass2 does reconstruction and deblocking
-+     int ctu_count; // Number of CTUs done in pass0 so far
-+     int max_ctu_count; // Number of CTUs when we trigger a round of processing
-+     int ctu_per_y_chan; // Number of CTUs per luma QPU
-+@@ -989,15 +988,12 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI_WORKER
-+     pthread_t worker_thread;
-+-    pthread_t worker_deblock_thread;
-+     pthread_cond_t worker_cond_head;
-+     pthread_cond_t worker_cond_tail;
-+-    pthread_cond_t worker_cond_middle;
-+     pthread_mutex_t worker_mutex;
-+ 
-+     int worker_tail; // Contains the number of posted jobs
-+     int worker_head; // Contains the number of completed jobs
-+-    int worker_middle; // Contains the number of completed jobs
-+     int kill_worker; // set to 1 to terminate the worker
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From d0720e2a6f21bbdf2ad1d52227ae272db4cf9dc0 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 11:04:43 +0100
-+Subject: [PATCH 62/68] Reduced amount of output frame that is invalidated
-+
-+---
-+ libavcodec/hevc.c | 45 +++++++++++++++++++++++++++++----------------
-+ 1 file changed, 29 insertions(+), 16 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 43f7ce5..ef61788 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -66,7 +66,7 @@
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+   static void rpi_begin(HEVCContext *s);
-+   static void flush_frame(HEVCContext *s,AVFrame *frame);
-+-  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
-+ 
-+ #endif
-+ 
-+@@ -3441,9 +3441,9 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
-+ #else
-+-    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+ #endif
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+@@ -3517,6 +3517,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     int n = s->ps.sps->height;
-+     int curr_y = 0;
-+     int curr_uv = 0;
-+@@ -3524,22 +3525,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+     int sz,base;
-+     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+     base = s->frame->linesize[1] * curr_uv;
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].addr = (int)(p->arm) + base;
-+     iocache.s[0].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[2]);
-+     iocache.s[1].handle = p->vcsm_handle;
-+     iocache.s[1].cmd = 3; // clean+invalidate
-+-    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].addr = (int)(p->arm) + base;
-+     iocache.s[1].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[0]);
-+     sz = s->frame->linesize[0] * (n-curr_y);
-+     base = s->frame->linesize[0] * curr_y;
-+     iocache.s[2].handle = p->vcsm_handle;
-+     iocache.s[2].cmd = 3; // clean+invalidate
-+-    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].addr = (int)(p->arm) + base;
-+     iocache.s[2].size  = sz;
-+     vcsm_clean_invalid( &iocache );
-+ #else
-+@@ -3549,33 +3549,46 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-+-static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
-+ {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+-    int n = s->ps.sps->height;
-+-    int curr_y = 0;
-+-    int curr_uv = 0;
-+-    int n_uv = n >> s->ps.sps->vshift[1];
-++    int n;
-++    int curr_y;
-++    int curr_uv;
-++    int n_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     int sz,base;
-++    int (*d)[2] = s->dblk_cmds[job];
-++    int low=(*d)[1];
-++    int high=(*d)[1];
-++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
-++        int y = (*d)[1];
-++        low=FFMIN(low,y);
-++        high=FFMAX(high,y);
-++    }
-++    curr_y = low;
-++    n = high+(1 << s->ps.sps->log2_ctb_size);
-++    curr_uv = curr_y >> s->ps.sps->vshift[1];
-++    n_uv = n >> s->ps.sps->vshift[1];
-++
-+     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+     base = s->frame->linesize[1] * curr_uv;
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].addr = (int)(p->arm) + base;
-+     iocache.s[0].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[2]);
-+     iocache.s[1].handle = p->vcsm_handle;
-+     iocache.s[1].cmd = 3; // clean+invalidate
-+-    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].addr = (int)(p->arm) + base;
-+     iocache.s[1].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[0]);
-+     sz = s->frame->linesize[0] * (n-curr_y);
-+     base = s->frame->linesize[0] * curr_y;
-+     iocache.s[2].handle = p->vcsm_handle;
-+     iocache.s[2].cmd = 3; // clean+invalidate
-+-    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].addr = (int)(p->arm) + base;
-+     iocache.s[2].size  = sz;
-+ 
-+     iocache.s[3].handle = p0->vcsm_handle;
-+-- 
-+2.5.0
-+
-+
-+From 980ce082dd1c0101e2aec64121c9de1d03a287f4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 11:55:29 +0100
-+Subject: [PATCH 63/68] Packed 16x16 and 32x32 into the same buffer
-+
-+---
-+ libavcodec/hevc.c       | 24 +++++++++++++++---------
-+ libavcodec/hevc_cabac.c |  9 ++++++++-
-+ libavcodec/rpi_qpu.c    |  2 +-
-+ 3 files changed, 24 insertions(+), 11 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index ef61788..8c6db35 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -297,12 +297,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+         s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-+         if (!s->coeffs_buf_arm[job][0])
-+             goto fail;
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-++        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
-+         s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-+         s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-+         if (!s->coeffs_buf_arm[job][2])
-+             goto fail;
-+-        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
-+         s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-+       }
-+     }
-+@@ -2943,15 +2943,20 @@ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+     int job = s->pass1_job;
-+-    //int j;
-+-    //int16_t *coeffs = s->coeffs_buf_arm[i];
-+-    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+-    //    s->hevcdsp.idct[4-2](coeffs, 16);
-+-    //}
-++    /*int j;
-++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
-++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
-++        s->hevcdsp.idct[4-2](coeffs, 16);
-++    }
-++    i=3;
-++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
-++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
-++        s->hevcdsp.idct[5-2](coeffs, 32);
-++    }*/
-+ 
-+     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-+-                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-+                                s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+@@ -3445,7 +3450,8 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ #else
-+     flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+ #endif
-+-    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
-++                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 16e7ac3..271e17a 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1051,7 +1051,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+         if (use_vpu) {
-+-            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            // We support size 4 and size 5.
-++            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
-++            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
-++            // num_coeffs is indexed by log2_trafo_size-2
-++            if (log2_trafo_size == 4)
-++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            else
-++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
-+             s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+         } else {
-+             coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4480f72..0121fca 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -5,7 +5,7 @@
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-+ // define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-+-//#define RPI_TIME_TOTAL_POSTED
-++#define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ 
-+-- 
-+2.5.0
-+
-+
-+From dd561eb52a075c09da89bf20f8d18fb92123ec2c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 25 Jun 2015 09:02:47 +0100
-+Subject: [PATCH 64/68] Moved luma deblock to VPU
-+
-+---
-+ libavcodec/hevc.c               |   18 +-
-+ libavcodec/hevc.h               |   11 +
-+ libavcodec/hevc_filter.c        |  120 ++-
-+ libavcodec/rpi_hevc_transform.h | 1802 ++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s |  426 +++++++++
-+ libavcodec/rpi_qpu.c            |   12 +-
-+ libavcodec/rpi_shader.c         |    2 +-
-+ 7 files changed, 2378 insertions(+), 13 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 8c6db35..da4bebb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -244,6 +244,12 @@ static void pic_arrays_free(HEVCContext *s)
-+       }
-+     }
-+ #endif
-++#ifdef RPI_DEBLOCK_VPU
-++    if (s->y_setup_arm) {
-++      gpu_free(&s->y_setup_ptr);
-++      s->y_setup_arm = 0;
-++    }
-++#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -281,12 +287,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-+ #ifdef RPI
-+-    av_assert0(sps);
-+     int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-+     int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-+     int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-+     int job;
-++    av_assert0(sps);
-+     s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-+     s->ctu_per_y_chan = s->max_ctu_count / 12;
-+     s->ctu_per_uv_chan = s->max_ctu_count / 8;
-+@@ -307,6 +313,16 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+       }
-+     }
-+ #endif
-++#ifdef RPI_DEBLOCK_VPU
-++    s->enable_rpi_deblock = !sps->sao_enabled;
-++    s->setup_width = (sps->width+15) / 16;
-++    s->setup_height = (sps->height+15) / 16;
-++    gpu_malloc_uncached(sizeof(*s->y_setup_arm) * s->setup_width * s->setup_height, &s->y_setup_ptr); // TODO make this cached
-++    s->y_setup_arm = (void*)s->y_setup_ptr.arm;
-++    s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-++    memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-++    printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-++#endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+     s->bs_height = (height >> 2) + 1;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 6c0d0b6..c933757 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -56,6 +56,8 @@
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+ 
-++  #define RPI_DEBLOCK_VPU
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -997,6 +999,15 @@ typedef struct HEVCContext {
-+     int kill_worker; // set to 1 to terminate the worker
-+ #endif
-+ 
-++#ifdef RPI_DEBLOCK_VPU
-++    int enable_rpi_deblock;
-++    GPU_MEM_PTR_T y_setup_ptr;
-++    uint8_t (*y_setup_arm)[2][2][2][4];
-++    uint8_t (*y_setup_vc)[2][2][2][4];
-++    int setup_width; // Number of 16x16 blocks across the image
-++    int setup_height; // Number of 16x16 blocks down the image
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 1f04790..06371da 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -564,6 +564,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                          s->frame->linesize[LUMA],
-+                                                          beta, tc, no_p, no_q);
-+                 } else
-++#ifdef RPI_DEBLOCK_VPU
-++                if (s->enable_rpi_deblock) {
-++                    uint8_t (*setup)[2][2][4];
-++                    int num16 = (y>>4)*s->setup_width + (x>>4);
-++                    int a = ((y>>3) & 1) << 1;
-++                    int b = (x>>3) & 1;
-++                    setup = s->y_setup_arm[num16];
-++                    setup[0][b][0][a] = beta;
-++                    setup[0][b][0][a + 1] = beta;
-++                    setup[0][b][1][a] = tc[0];
-++                    setup[0][b][1][a + 1] = tc[1];
-++                } else
-++#endif
-+                     s->hevcdsp.hevc_v_loop_filter_luma(src,
-+                                                        s->frame->linesize[LUMA],
-+                                                        beta, tc, no_p, no_q);
-+@@ -596,6 +609,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                          s->frame->linesize[LUMA],
-+                                                          beta, tc, no_p, no_q);
-+                 } else
-++#ifdef RPI_DEBLOCK_VPU
-++                if (s->enable_rpi_deblock) {
-++                    uint8_t (*setup)[2][2][4];
-++                    int num16 = (y>>4)*s->setup_width + (x>>4);
-++                    int a = ((x>>3) & 1) << 1;
-++                    int b = (y>>3) & 1;
-++                    setup = s->y_setup_arm[num16];
-++                    setup[1][b][0][a] = beta;
-++                    setup[1][b][0][a + 1] = beta;
-++                    setup[1][b][1][a] = tc[0];
-++                    setup[1][b][1][a + 1] = tc[1];
-++                } else
-++#endif
-+                     s->hevcdsp.hevc_h_loop_filter_luma(src,
-+                                                        s->frame->linesize[LUMA],
-+                                                        beta, tc, no_p, no_q);
-+@@ -876,33 +902,85 @@ static void flush_buffer(AVBufferRef *bref) {
-+ }
-+ 
-+ // Return Physical address for this image
-+-static int ff_hevc_buf_base(AVBufferRef *bref) {
-++static uint32_t get_vc_address(AVBufferRef *bref) {
-+   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+-  return p->vc & 0x3fffffff;
-++  return p->vc;
-+ }
-+ 
-++// ff_hevc_flush_buffer_lines
-++// flushes and invalidates all pixel rows in [start,end-1]
-++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++        struct vcsm_user_clean_invalid_s iocache = {};
-++        int curr_y = start;
-++        int n = end;
-++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-++        int n_uv = n >> s->ps.sps->vshift[1];
-++        int sz,base;
-++        GPU_MEM_PTR_T *p;
-++        if (curr_uv < 0) curr_uv = 0;
-++        if (n_uv<=curr_uv) { return; }
-++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++        base = s->frame->linesize[1] * curr_uv;
-++        if (flush_chroma) {
-++          p = av_buffer_pool_opaque(s->frame->buf[1]);
-++          iocache.s[0].handle = p->vcsm_handle;
-++          iocache.s[0].cmd = 3; // clean+invalidate
-++          iocache.s[0].addr = (int)p->arm + base;
-++          iocache.s[0].size  = sz;
-++          p = av_buffer_pool_opaque(s->frame->buf[2]);
-++          iocache.s[1].handle = p->vcsm_handle;
-++          iocache.s[1].cmd = 3; // clean+invalidate
-++          iocache.s[1].addr = (int)p->arm + base;
-++          iocache.s[1].size  = sz;
-++        }
-++        if (flush_luma) {
-++          p = av_buffer_pool_opaque(s->frame->buf[0]);
-++          sz = s->frame->linesize[0] * (n-curr_y);
-++          base = s->frame->linesize[0] * curr_y;
-++          iocache.s[2].handle = p->vcsm_handle;
-++          iocache.s[2].cmd = 3; // clean+invalidate
-++          iocache.s[2].addr = (int)p->arm + base;
-++          iocache.s[2].size  = sz;
-++        }
-++        vcsm_clean_invalid( &iocache );
-++#else
-++        if (flush_chroma) {
-++          flush_buffer(s->frame->buf[1]);
-++          flush_buffer(s->frame->buf[2]);
-++        }
-++        if (flush_luma) {
-++          flush_buffer(s->frame->buf[0]);
-++        }
-++#endif
-++}
-++
-++
-+ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && s->used_for_ref) {
-++      // TODO make this use ff_hevc_flush_buffer_lines
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+         int curr_y = ((int *)f->progress->data)[0];
-+         int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+         int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-++        GPU_MEM_PTR_T *p;
-+         if (curr_uv < 0) curr_uv = 0;
-+         if (n_uv<=curr_uv) { return; }
-+         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+         base = s->frame->linesize[1] * curr_uv;
-+-        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-++        p = av_buffer_pool_opaque(s->frame->buf[1]);
-+         iocache.s[0].handle = p->vcsm_handle;
-+         iocache.s[0].cmd = 3; // clean+invalidate
-+-        iocache.s[0].addr = p->arm + base;
-++        iocache.s[0].addr = (int)p->arm + base;
-+         iocache.s[0].size  = sz;
-+         p = av_buffer_pool_opaque(s->frame->buf[2]);
-+         iocache.s[1].handle = p->vcsm_handle;
-+         iocache.s[1].cmd = 3; // clean+invalidate
-+-        iocache.s[1].addr = p->arm + base;
-++        iocache.s[1].addr = (int)p->arm + base;
-+         iocache.s[1].size  = sz;
-+ 
-+ #ifdef RPI_LUMA_QPU
-+@@ -911,7 +989,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         base = s->frame->linesize[0] * curr_y;
-+         iocache.s[2].handle = p->vcsm_handle;
-+         iocache.s[2].cmd = 3; // clean+invalidate
-+-        iocache.s[2].addr = p->arm + base;
-++        iocache.s[2].addr = (int)p->arm + base;
-+         iocache.s[2].size  = sz;
-+ #endif
-+         vcsm_clean_invalid( &iocache );
-+@@ -930,11 +1008,40 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ }
-+ #endif
-+ 
-++#ifdef RPI_DEBLOCK_VPU
-++/* rpi_deblock: flush frame lines [max(y-4,0), y+ctb_size), then post a VPU deblock job for the ctb row starting at line y and block until it completes */
-++static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
-++{
-++  // Flush image, 4 lines above to bottom of ctb stripe (the trailing 1, 0 select which planes are flushed - presumably luma only; confirm against ff_hevc_flush_buffer_lines)
-++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
-++  // TODO flush buffer of beta/tc setup when it becomes cached
-++  // Call VPU and wait synchronously for the posted job
-++  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
-++  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
-++                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
-++                               ctb_size>>4, 2, 0)); // ctb_size>>4 = number of 16 line rows; 2 means to do the deblocking code
-++}
-++/* rpi_deblock2: calls rpi_deblock once per 16 line strip in [y, y+ctb_size). NOTE(review): no caller is visible in this hunk - confirm it is still needed */
-++static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
-++{
-++   int y2;
-++   for(y2=y;y2<y+ctb_size;y2+=16) {
-++      rpi_deblock(s,y2,16);
-++   }
-++}
-++#endif
-++
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-+     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
-+         deblocking_filter_CTB(s, x, y);
-++#ifdef RPI_DEBLOCK_VPU
-++    if (s->enable_rpi_deblock && x_end)
-++    {
-++      rpi_deblock(s, y, ctb_size);
-++    }
-++#endif
-+     if (s->ps.sps->sao_enabled) {
-+         int y_end = y >= s->ps.sps->height - ctb_size;
-+         if (y && x)
-+@@ -965,6 +1072,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-++        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 4f13622..b3f155f 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,7 +3,13 @@ unsigned char rpi_hevc_transform [] = {
-+ 106,
-+ 0,
-+ 144,
-+-35,
-++38,
-++1,
-++37,
-++106,
-++0,
-++144,
-++57,
-+ 1,
-+ 169,
-+ 3,
-+@@ -627,4 +633,1798 @@ unsigned char rpi_hevc_transform [] = {
-+ 30,
-+ 90,
-+ 0,
-++169,
-++3,
-++73,
-++64,
-++52,
-++64,
-++45,
-++64,
-++2,
-++64,
-++10,
-++64,
-++64,
-++198,
-++1,
-++7,
-++8,
-++232,
-++63,
-++0,
-++0,
-++0,
-++6,
-++232,
-++253,
-++255,
-++255,
-++255,
-++0,
-++246,
-++0,
-++0,
-++0,
-++4,
-++215,
-++64,
-++3,
-++96,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++137,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++129,
-++0,
-++131,
-++102,
-++0,
-++158,
-++67,
-++0,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++108,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++100,
-++0,
-++131,
-++102,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++161,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++150,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++3,
-++99,
-++131,
-++71,
-++68,
-++232,
-++32,
-++0,
-++0,
-++0,
-++0,
-++99,
-++2,
-++99,
-++23,
-++102,
-++7,
-++106,
-++127,
-++156,
-++182,
-++255,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++112,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++101,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++25,
-++102,
-++9,
-++106,
-++2,
-++30,
-++41,
-++3,
-++26,
-++87,
-++162,
-++64,
-++64,
-++198,
-++1,
-++23,
-++127,
-++158,
-++103,
-++255,
-++239,
-++3,
-++0,
-++254,
-++0,
-++143,
-++92,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++143,
-++93,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++143,
-++94,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++95,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++208,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++209,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++142,
-++210,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++0,
-++142,
-++211,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++107,
-++0,
-++8,
-++255,
-++99,
-++23,
-++0,
-++212,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++23,
-++0,
-++228,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++227,
-++23,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++52,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++99,
-++52,
-++0,
-++164,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++52,
-++0,
-++148,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++239,
-++3,
-++0,
-++254,
-++0,
-++143,
-++12,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++143,
-++13,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++143,
-++14,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++15,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++16,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++17,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++142,
-++18,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++0,
-++142,
-++19,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++33,
-++0,
-++8,
-++255,
-++99,
-++3,
-++0,
-++212,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++3,
-++0,
-++228,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++227,
-++3,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++4,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++99,
-++4,
-++0,
-++164,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++4,
-++0,
-++148,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++32,
-++246,
-++192,
-++11,
-++1,
-++16,
-++32,
-++246,
-++2,
-++137,
-++47,
-++240,
-++40,
-++246,
-++2,
-++140,
-++47,
-++240,
-++128,
-++245,
-++99,
-++140,
-++5,
-++4,
-++0,
-++247,
-++99,
-++140,
-++1,
-++20,
-++88,
-++246,
-++99,
-++140,
-++1,
-++20,
-++0,
-++247,
-++35,
-++136,
-++62,
-++226,
-++32,
-++247,
-++35,
-++136,
-++32,
-++210,
-++0,
-++247,
-++34,
-++136,
-++63,
-++2,
-++208,
-++246,
-++34,
-++136,
-++0,
-++4,
-++0,
-++247,
-++99,
-++136,
-++58,
-++162,
-++32,
-++247,
-++99,
-++136,
-++33,
-++146,
-++0,
-++247,
-++98,
-++136,
-++59,
-++18,
-++208,
-++246,
-++98,
-++136,
-++0,
-++20,
-++0,
-++247,
-++162,
-++136,
-++33,
-++2,
-++88,
-++246,
-++98,
-++137,
-++2,
-++68,
-++88,
-++246,
-++162,
-++137,
-++3,
-++68,
-++208,
-++254,
-++227,
-++136,
-++60,
-++242,
-++192,
-++243,
-++188,
-++11,
-++208,
-++254,
-++227,
-++136,
-++56,
-++178,
-++192,
-++243,
-++188,
-++10,
-++32,
-++255,
-++226,
-++136,
-++38,
-++58,
-++192,
-++243,
-++60,
-++0,
-++208,
-++254,
-++227,
-++136,
-++59,
-++242,
-++192,
-++243,
-++60,
-++128,
-++32,
-++255,
-++226,
-++136,
-++49,
-++58,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++226,
-++136,
-++34,
-++34,
-++192,
-++243,
-++60,
-++128,
-++32,
-++255,
-++226,
-++136,
-++37,
-++58,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++192,
-++136,
-++1,
-++4,
-++0,
-++240,
-++0,
-++160,
-++0,
-++255,
-++194,
-++8,
-++0,
-++52,
-++195,
-++243,
-++0,
-++128,
-++0,
-++255,
-++202,
-++40,
-++0,
-++52,
-++195,
-++243,
-++0,
-++128,
-++0,
-++254,
-++0,
-++240,
-++35,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++192,
-++136,
-++1,
-++4,
-++0,
-++240,
-++0,
-++160,
-++0,
-++255,
-++226,
-++140,
-++34,
-++34,
-++195,
-++243,
-++60,
-++0,
-++32,
-++255,
-++227,
-++140,
-++36,
-++58,
-++192,
-++243,
-++60,
-++0,
-++0,
-++254,
-++192,
-++136,
-++0,
-++4,
-++0,
-++240,
-++0,
-++160,
-++16,
-++246,
-++226,
-++136,
-++35,
-++50,
-++16,
-++246,
-++226,
-++136,
-++35,
-++50,
-++32,
-++246,
-++226,
-++136,
-++35,
-++50,
-++32,
-++254,
-++226,
-++136,
-++35,
-++58,
-++192,
-++243,
-++60,
-++0,
-++11,
-++96,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++115,
-++5,
-++106,
-++0,
-++144,
-++173,
-++1,
-++27,
-++96,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++147,
-++5,
-++106,
-++0,
-++144,
-++227,
-++0,
-++64,
-++246,
-++163,
-++140,
-++1,
-++4,
-++0,
-++246,
-++192,
-++175,
-++63,
-++2,
-++0,
-++246,
-++192,
-++174,
-++59,
-++2,
-++0,
-++246,
-++128,
-++175,
-++62,
-++2,
-++0,
-++246,
-++128,
-++174,
-++58,
-++2,
-++0,
-++246,
-++64,
-++175,
-++61,
-++2,
-++0,
-++246,
-++64,
-++174,
-++57,
-++2,
-++0,
-++255,
-++43,
-++240,
-++4,
-++212,
-++192,
-++243,
-++128,
-++11,
-++64,
-++254,
-++43,
-++240,
-++1,
-++228,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++244,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++180,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++164,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++191,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++235,
-++143,
-++52,
-++242,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++2,
-++212,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++191,
-++226,
-++192,
-++243,
-++188,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++180,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++2,
-++68,
-++32,
-++247,
-++35,
-++141,
-++190,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++171,
-++143,
-++52,
-++226,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++180,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++191,
-++226,
-++192,
-++243,
-++188,
-++10,
-++128,
-++253,
-++43,
-++240,
-++3,
-++212,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++35,
-++141,
-++1,
-++196,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++189,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++107,
-++143,
-++52,
-++210,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++148,
-++192,
-++243,
-++128,
-++11,
-++64,
-++254,
-++43,
-++240,
-++1,
-++164,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++180,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++244,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++228,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++187,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++235,
-++142,
-++52,
-++178,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++2,
-++148,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++187,
-++162,
-++192,
-++243,
-++188,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++244,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++2,
-++68,
-++32,
-++247,
-++35,
-++141,
-++186,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++171,
-++142,
-++52,
-++162,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++244,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++187,
-++162,
-++192,
-++243,
-++188,
-++10,
-++128,
-++253,
-++43,
-++240,
-++3,
-++148,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++35,
-++141,
-++1,
-++132,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++185,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++107,
-++142,
-++52,
-++146,
-++192,
-++243,
-++60,
-++128,
-++64,
-++255,
-++98,
-++141,
-++0,
-++52,
-++192,
-++243,
-++0,
-++0,
-++0,
-++254,
-++0,
-++240,
-++53,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++147,
-++5,
-++106,
-++0,
-++144,
-++177,
-++0,
-++88,
-++246,
-++163,
-++140,
-++1,
-++4,
-++128,
-++245,
-++99,
-++141,
-++10,
-++4,
-++88,
-++246,
-++162,
-++138,
-++1,
-++68,
-++0,
-++247,
-++162,
-++138,
-++36,
-++162,
-++88,
-++254,
-++162,
-++138,
-++3,
-++164,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++226,
-++137,
-++32,
-++2,
-++195,
-++243,
-++60,
-++0,
-++32,
-++247,
-++226,
-++137,
-++42,
-++114,
-++0,
-++255,
-++34,
-++138,
-++33,
-++18,
-++195,
-++243,
-++60,
-++0,
-++32,
-++247,
-++34,
-++138,
-++42,
-++130,
-++16,
-++246,
-++98,
-++138,
-++40,
-++114,
-++16,
-++246,
-++98,
-++138,
-++41,
-++146,
-++32,
-++246,
-++98,
-++138,
-++41,
-++146,
-++32,
-++246,
-++226,
-++137,
-++41,
-++146,
-++40,
-++246,
-++34,
-++138,
-++41,
-++146,
-++32,
-++247,
-++163,
-++141,
-++63,
-++178,
-++32,
-++247,
-++227,
-++141,
-++62,
-++162,
-++0,
-++254,
-++0,
-++240,
-++8,
-++4,
-++0,
-++240,
-++128,
-++11,
-++128,
-++253,
-++35,
-++240,
-++9,
-++100,
-++192,
-++243,
-++128,
-++10,
-++128,
-++253,
-++163,
-++141,
-++128,
-++115,
-++192,
-++243,
-++152,
-++10,
-++88,
-++246,
-++163,
-++141,
-++4,
-++100,
-++208,
-++246,
-++35,
-++139,
-++0,
-++100,
-++32,
-++255,
-++34,
-++139,
-++53,
-++202,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++139,
-++0,
-++4,
-++0,
-++240,
-++0,
-++160,
-++240,
-++246,
-++163,
-++141,
-++48,
-++98,
-++0,
-++247,
-++99,
-++139,
-++63,
-++210,
-++0,
-++247,
-++98,
-++139,
-++1,
-++212,
-++88,
-++254,
-++98,
-++139,
-++1,
-++212,
-++192,
-++243,
-++128,
-++11,
-++32,
-++255,
-++99,
-++139,
-++62,
-++98,
-++192,
-++243,
-++188,
-++10,
-++88,
-++246,
-++98,
-++139,
-++1,
-++212,
-++240,
-++246,
-++98,
-++139,
-++50,
-++210,
-++0,
-++247,
-++163,
-++128,
-++59,
-++146,
-++0,
-++247,
-++160,
-++128,
-++1,
-++36,
-++88,
-++254,
-++160,
-++128,
-++1,
-++36,
-++192,
-++243,
-++128,
-++11,
-++0,
-++247,
-++163,
-++128,
-++58,
-++98,
-++64,
-++255,
-++35,
-++240,
-++0,
-++100,
-++192,
-++243,
-++128,
-++10,
-++64,
-++255,
-++163,
-++128,
-++0,
-++164,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++160,
-++128,
-++1,
-++36,
-++240,
-++246,
-++160,
-++128,
-++50,
-++34,
-++8,
-++255,
-++227,
-++143,
-++54,
-++242,
-++192,
-++243,
-++60,
-++128,
-++40,
-++255,
-++227,
-++142,
-++54,
-++178,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++240,
-++39,
-++10,
-++0,
-++240,
-++60,
-++128,
-++8,
-++255,
-++163,
-++143,
-++45,
-++226,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++240,
-++44,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++0,
-++240,
-++40,
-++10,
-++0,
-++240,
-++60,
-++128,
-++8,
-++255,
-++163,
-++142,
-++2,
-++162,
-++192,
-++243,
-++60,
-++128,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index fd159bc..b055208 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -83,6 +83,8 @@
-+ hevc_trans_16x16:
-+   cmp r5,1
-+   beq memclear16
-++  cmp r5,2
-++  beq hevc_deblock_16x16
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -282,3 +284,427 @@ loop:
-+   cmp r1,0
-+   bgt loop
-+   b lr
-++
-++
-++################################################################################
-++# HEVC VPU Deblock
-++#
-++# Vertical edges before horizontal
-++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
-++#
-++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
-++# The VPU code works in units of 16x16 blocks.
-++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
-++# One final horizontal filter is required at the end.
-++# PCM is not allowed in this code.
-++#
-++#
-++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
-++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering).
-++# Row numbers passed as the first argument of H()/HX(). P3..P0 and Q0..Q3 hold the four pixels on each side of the edge being filtered.
-++.set P0,63
-++.set P1,62
-++.set P2,61
-++.set P3,60
-++.set Q0,59
-++.set Q1,58
-++.set Q2,57
-++.set Q3,56
-++# Scratch rows. NOTE(review): tc25 is .set twice below (46, then 49); presumably the later value wins and row 46 is unused - confirm with the assembler.
-++.set dp,32
-++.set dq,33
-++.set d,34
-++.set decision,35
-++.set beta,36
-++.set beta2,37
-++.set beta3,38
-++.set ptest,39
-++.set qtest,40
-++.set pqtest,41
-++.set thresh,42
-++.set deltatest, 44
-++.set deltap1, 45
-++.set tc25, 46
-++.set setup,47
-++.set tc,48
-++.set tc25,49
-++.set tc2, 50
-++.set do_filter, 51
-++.set delta, 52
-++.set tc10, 53
-++.set delta0, 54
-++.set delta1, 55
-++.set zeros, 0
-++.set setup_input, 1
-++.set deltaq1, 2
-++
-++
-++
-++# hevc_deblock_16x16 deblocks num16h rows, each 16 pixels high by the full width of the image.
-++# Each row has num16w 16x16 blocks across
-++# Beta goes from 0 to 64
-++# tc goes from 0 to 24
-++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
-++#   has 8 bytes per edge
-++#   has 16 bytes per direction
-++#   has 32 bytes per 16x16 block
-++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
-++hevc_deblock_16x16:
-++  push r6-r15, lr
-++  mov r9,r4 # r9 = num16h
-++  mov r4,r3 # r4 = setup
-++  mov r13,r2 # r13 = num16w
-++  mov r2,r0 # r2 = img
-++  mov r10,r0 # r10 = img, kept as the base of the current row
-++  subscale4 r0,r1 # r0 = img - 4 * stride
-++  mov r8,63
-++  mov r6,-3
-++  vmov H(zeros,0),0
-++# r7 is number of blocks still to load
-++# r0 is location of current block - 4 * stride
-++# r1 is stride
-++# r2 is location of current block
-++# r3 is offset of start of block (vert_filter works on the edge at V(16,16)+r3, horz_filter on the edge at H(16,0)+r3)
-++# r4 is setup
-++# r5 is for temporary calculations
-++# r8 holds 63
-++# r6 holds -3
-++# r9 holds the number of 16 high rows to process
-++# r10 holds the img base of the row being processed (advanced by 16 lines in start_again)
-++# r11 returns 0 if no filtering was done on the edge
-++# r12 saves a copy of r11 from the first horz_filter call of a block
-++# r13 is copy of width
-++
-++process_row:
-++  # First iteration does not do horizontal filtering on previous
-++  mov r7, r13
-++  mov r3,0
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
-++  vstb H(zeros,0),(r4) # Overwrite the setup bytes just read with zeros
-++  bl vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
-++  bl vert_filter
-++  sub r3,8
-++  b start_deblock_loop
-++deblock_loop:
-++  # Middle iterations do vertical on current block and horizontal on preceding
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)
-++  vstb H(zeros,0),(r4)
-++  bl vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl vert_filter
-++  sub r3,8
-++  vldb H(setup_input,0), -16(r4) # r4 was already advanced by 32, so this is the horz half of the previous block's setup
-++  vstb H(zeros,0),-16(r4)
-++  bl horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl horz_filter
-++  sub r3,8*64
-++  addcmpbeq r12,0,0,skip_save_top # Skip storing the 4 context lines if the first horz_filter call reported no filtering
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++skip_save_top:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++start_deblock_loop:
-++  # move onto next 16x16 (could do this with circular buffer support instead)
-++  add r3,16
-++  and r3,r8
-++  add r4,32
-++  # Perform loop counter operations (may work with an addcmpbgt as well?)
-++  add r0,16
-++  add r2,16
-++  sub r7,1
-++  cmp r7,0 # Are there still more blocks to load
-++  bgt deblock_loop
-++
-++  # Final iteration needs to just do horizontal filtering
-++  vldb H(setup_input,0), -16(r4)
-++  vstb H(zeros,0),-16(r4)
-++  bl horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl horz_filter
-++  sub r3,64*8
-++  addcmpbeq r12,0,0,skip_save_top2
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++skip_save_top2:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++
-++# Now look to see if we should do another row
-++  sub r9,1
-++  cmp r9,0
-++  bgt start_again
-++  pop r6-r15, pc
-++start_again:
-++  # Need to sort out r0,r2 to point to next row down
-++  addscale16 r10,r1
-++  mov r2,r10
-++  subscale4 r0,r2,r1
-++  b process_row
-++
-++
-++# vert_filter: at this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered.
-++# To reuse do_luma_filter, the pixels to be filtered are first moved into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0); a final saturation step (vadds) is performed when placing them back into the correct locations.
-++
-++vert_filter:
-++  push lr
-++  # Gather V(16,12..19)+r3 (the four pixels either side of the edge) into rows P3..Q3
-++  vmov HX(P3,0), V(16,12)+r3
-++  vmov HX(P2,0), V(16,13)+r3
-++  vmov HX(P1,0), V(16,14)+r3
-++  vmov HX(P0,0), V(16,15)+r3
-++  vmov HX(Q0,0), V(16,16)+r3
-++  vmov HX(Q1,0), V(16,17)+r3
-++  vmov HX(Q2,0), V(16,18)+r3
-++  vmov HX(Q3,0), V(16,19)+r3
-++
-++  bl do_luma_filter
-++
-++  vadds V(16,13)+r3, HX(P2,0), 0
-++  vadds V(16,14)+r3, HX(P1,0), 0
-++  vadds V(16,15)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds V(16,16)+r3, HX(Q0,0), 0
-++  vadds V(16,17)+r3, HX(Q1,0), 0
-++  vadds V(16,18)+r3, HX(Q2,0), 0
-++
-++  pop pc
-++
-++# horz_filter: filter the edge at H(16,0)+r3 using the same do_luma_filter routine as vert_filter
-++horz_filter:
-++  push lr
-++  # Gather H(12..19,0)+r3 (the four lines either side of the edge) into rows P3..Q3
-++  vmov HX(P3,0), H(12,0)+r3
-++  vmov HX(P2,0), H(13,0)+r3
-++  vmov HX(P1,0), H(14,0)+r3
-++  vmov HX(P0,0), H(15,0)+r3
-++  vmov HX(Q0,0), H(16,0)+r3
-++  vmov HX(Q1,0), H(17,0)+r3
-++  vmov HX(Q2,0), H(18,0)+r3
-++  vmov HX(Q3,0), H(19,0)+r3
-++
-++  bl do_luma_filter
-++
-++  vadds H(13,0)+r3, HX(P2,0), 0
-++  vadds H(14,0)+r3, HX(P1,0), 0
-++  vadds H(15,0)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds H(16,0)+r3, HX(Q0,0), 0
-++  vadds H(17,0)+r3, HX(Q1,0), 0
-++  vadds H(18,0)+r3, HX(Q2,0), 0
-++
-++  pop pc
-++
-++# do_luma_filter: filters rows P2..Q2 in place using P3..Q3 as input. H(setup_input,0) must already hold the beta/tc bytes (one pair per 4 length edge) that the caller loaded from the setup array at r4. Sets r11 to 1 if any lane was filtered, else 0; uses r5 as scratch.
-++do_luma_filter:
-++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
-++  valtl HX(beta,0),H(setup,0),H(setup,0)
-++  valtu HX(tc,0),H(setup,0),H(setup,0)
-++  vmul HX(tc25,0), HX(tc,0), 5
-++  vadd HX(tc25,0),HX(tc25,0), 1
-++  vasr HX(tc25,0), HX(tc25,0), 1
-++
-++  # Compute decision
-++  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
-++  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
-++  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
-++  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
-++
-++  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
-++  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
-++  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
-++  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
-++
-++  vadd HX(d,0), HX(dp,0), HX(dq,0)
-++  vasr HX(beta2,0),HX(beta,0),2
-++  vasr HX(beta3,0),HX(beta,0),3
-++
-++  # Compute flags that are negative if all conditions pass
-++  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
-++  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
-++  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
-++
-++  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
-++  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
-++  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
-++  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
-++  vmov HX(decision,0), 1 IFNN
-++  vadd H(decision,0),H(decision,3),0 IFN
-++  vadd H(decision,16),H(decision,19),0 IFN
-++  vmov -,HX(decision,0) SETF   # N marks strong filter
-++  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
-++
-++  vadd HX(do_filter,0), HX(d,3), HX(d,0)
-++  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
-++  vmov HX(decision,0),0 IFNN # Z marks no filter
-++
-++  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
-++  # First extract out even terms
-++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
-++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
-++  # Now expand back
-++  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
-++  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
-++
-++  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
-++
-++  # Do a quick check to see if there is anything to do
-++  mov r11, 0 # Signal no filtering
-++  vmov -,1 IFNZ SUMS r5
-++  cmp r5,0
-++  beq filtering_done
-++  mov r11, 1 # Signal some filtering
-++  # And whether there is any strong filtering
-++  vmov -,1 IFN SUMS r5
-++  cmp r5,0
-++  beq normal_filtering
-++
-++  ##############################################################################
-++  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
-++  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tc2 is tc/2, while here it is tc*2
-++
-++  # Take a copy of the original pixels for use in decision calculation
-++  vmov HX(P0,32),HX(P0,0)
-++  vmov HX(Q0,32),HX(Q0,0)
-++  vmov HX(P1,32),HX(P1,0)
-++  vmov HX(Q1,32),HX(Q1,0)
-++  vmov HX(P2,32),HX(P2,0)
-++  vmov HX(Q2,32),HX(Q2,0)
-++
-++  vadd -,HX(P2,32),4 CLRA SACC
-++  vshl -,HX(P1,32),1 SACC
-++  vshl -,HX(P0,32),1 SACC
-++  vshl -,HX(Q0,32),1 SACC
-++  vshl HX(delta,0),HX(Q1,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(P0,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
-++
-++  vadd -,HX(P2,32),2 CLRA SACC
-++  vadd -,HX(P1,32),HX(P0,32) SACC
-++  vshl HX(delta,0),HX(Q0,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 2
-++  vsub HX(delta,0),HX(delta,0),HX(P1,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
-++
-++  vadd -,HX(Q0,32),4 CLRA SACC
-++  vadd -,HX(P1,32),HX(P0,32) SACC
-++  vmul -,HX(P2,32),3 SACC
-++  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(P2,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
-++  #vmov HX(P2,0),3 IFN
-++
-++  # Now reverse all P/Qs
-++
-++  vadd -,HX(Q2,32),4 CLRA SACC
-++  vshl -,HX(Q1,32),1 SACC
-++  vshl -,HX(Q0,32),1 SACC
-++  vshl -,HX(P0,32),1 SACC
-++  vshl HX(delta,0),HX(P1,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
-++
-++  vadd -,HX(Q2,32),2 CLRA SACC
-++  vadd -,HX(Q1,32),HX(Q0,32) SACC
-++  vshl HX(delta,0),HX(P0,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 2
-++  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
-++
-++  vadd -,HX(P0,32),4 CLRA SACC
-++  vadd -,HX(Q1,32),HX(Q0,32) SACC
-++  vmul -,HX(Q2,32),3 SACC
-++  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
-++
-++  ##############################################################################
-++  # Normal filtering (the strong filtering code above falls through to here)
-++normal_filtering:
-++  # Invert the decision flags
-++  # make instruction more complicated as assembler has error and loses SETF
-++  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
-++  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
-++
-++  vmov -,1 IFN SUMS r5
-++  cmp r5,0
-++  beq filtering_done
-++
-++  vasr HX(tc2,0), HX(tc,0), 1
-++  vmul HX(tc10,0), HX(tc,0), 10
-++
-++  vasr HX(thresh,0), HX(beta,0), 1
-++  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
-++  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
-++
-++  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
-++  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
-++  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
-++  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
-++  # Expand ptest and qtest together
-++  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
-++  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
-++  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
-++  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
-++  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
-++
-++  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
-++  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
-++  vmov -,8 CLRA SACC
-++  vmul -,HX(delta0,0), 9 SACC
-++  vmul HX(delta0,0),HX(delta1,0), r6 SACC
-++  vasr HX(delta0,0), HX(delta0,0), 4
-++  vdist HX(deltatest,0), HX(delta0,0), 0
-++  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
-++  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
-++
-++  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
-++
-++  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
-++  vadd HX(deltap1,0), HX(deltap1,0), 1
-++  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
-++  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
-++  vasr HX(deltap1,0), HX(deltap1,0), 1
-++  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
-++
-++  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
-++  vadd HX(deltaq1,0), HX(deltaq1,0), 1
-++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
-++  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) # NOTE(review): has no SACC/SETF and deltaq1 is rewritten two lines below - looks like a dead instruction, confirm
-++  vrsub -, HX(delta0,0), 0 SACC
-++  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
-++  vasr HX(deltaq1,0), HX(deltaq1,0), 1
-++  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
-++
-++  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
-++  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
-++
-++  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
-++  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
-++
-++  vmov -,HX(deltatest,0) SETF
-++  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
-++  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
-++
-++  #vmov HX(P2,0),1 IFN
-++
-++filtering_done:
-++  b lr
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 0121fca..05b2169 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -147,7 +147,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   vcsm_init();
-+   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+-  memset(ptr, 0, sizeof *ptr);
-++  memset((void*)ptr, 0, sizeof *ptr);
-+   vc = gpu_mem_ptr.vc;
-+ 
-+   ptr->mb = mb;
-+@@ -254,7 +254,7 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm;
-++    iocache.s[0].addr = (int) p->arm;
-+     iocache.s[0].size  = p->numbytes;
-+     vcsm_clean_invalid( &iocache );
-+ #else
-+@@ -390,6 +390,7 @@ static void *vpu_start(void *arg) {
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+   int last_time=0;
-+   long long on_time=0;
-++  long long on_time_deblock=0;
-+   long long off_time=0;
-+   int start_time;
-+   int end_time;
-+@@ -451,10 +452,13 @@ static void *vpu_start(void *arg) {
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+     end_time = Microseconds();
-+     last_time = end_time;
-+-    on_time += end_time - start_time;
-++    if (p[6]==2)
-++      on_time_deblock += end_time - start_time;
-++    else
-++      on_time += end_time - start_time;
-+     count++;
-+     if ((count&0x7f)==0)
-+-      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-+ #endif
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index e86eb30..c5d8b29 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -61,7 +61,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+ /* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+ /* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
-+-/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-++/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif,r2
-+ /* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
-+ /* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+ /* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-- 
-+2.5.0
-+
-+
-+From 8864ce029b80325be328e0b2493f5ba18b10c906 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 1 Jul 2015 09:21:17 +0100
-+Subject: [PATCH 65/68] Added ability to combine jobs
-+
-+---
-+ libavcodec/rpi_qpu.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
-+ 1 file changed, 80 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 05b2169..91777be 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -8,6 +8,8 @@
-+ #define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-++// Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
-++#define RPI_COMBINE_JOBS
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -398,9 +400,15 @@ static void *vpu_start(void *arg) {
-+ #endif
-+   while(1) {
-+     int i;
-+-    int *p;
-++    int *p; // Pointer for a QPU/VPU job
-++#ifdef RPI_COMBINE_JOBS
-++    int *q = NULL; // Pointer for a VPU only job
-++    int have_qpu = 0;
-++    int have_vpu = 0;
-++#endif
-+     int qpu_code;
-+     int qpu_codeb;
-++    int num_jobs; // Number of jobs available
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -408,13 +416,38 @@ static void *vpu_start(void *arg) {
-+       pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+     p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    num_jobs = vpu_async_tail - vpu_async_head;
-+     pthread_mutex_unlock(&post_mutex);
-+ 
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-++    if (p[7] == 0 && p[0] == 0 && p[16]==0)
-++      goto job_done_early;
-++
-++#ifdef RPI_COMBINE_JOBS
-++    // First scan for a qpu job
-++    for (int x=0;x<num_jobs;x++) {
-++      p = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-++      if (p[7]) {
-++        have_qpu = 1;
-++        break;
-++      }
-++    }
-++    // Now scan for a non-qpu job
-++    for (int x=0;x<num_jobs;x++) {
-++      q = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-++      if (!q[7]) {
-++        have_vpu = 1;
-++        break;
-++      }
-++    }
-++    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-++#endif
-+     qpu_code = p[7];
-+     qpu_codeb = p[16];
-++
-++
-+     //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+@@ -427,6 +460,40 @@ static void *vpu_start(void *arg) {
-+     off_time += start_time-last_time;
-+ #endif
-+ 
-++#ifdef RPI_COMBINE_JOBS
-++    if (have_qpu) {
-++      for(i=0;i<8;i++) {
-++        gpu->mail[i*2] = p[8+i];
-++        gpu->mail[i*2 + 1] = qpu_code;
-++      }
-++      for(i=0;i<12;i++) {
-++        gpu->mail2[i*2] = p[17+i];
-++        gpu->mail2[i*2 + 1] = qpu_codeb;
-++      }
-++      if (have_vpu) {
-++        execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-++        q[0] = 0;
-++      } else {
-++        execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++      }
-++      p[0] = 0;
-++      p[7] = 0;
-++      p[16] = 0;
-++    } else {
-++        av_assert0(have_vpu);
-++        vpu_execute_code(q[0], q[1], q[2], q[3], q[4], q[5], q[6]);
-++        q[0] = 0;
-++    }
-++#else
-++
-+     if (!qpu_code) {
-+       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+     } else {
-+@@ -449,17 +516,29 @@ static void *vpu_start(void *arg) {
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+ #endif
-+     }
-++#endif
-++
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+     end_time = Microseconds();
-+     last_time = end_time;
-++#ifdef RPI_COMBINE_JOBS
-++    // There are three cases we may wish to distinguish of VPU/QPU activity
-++    on_time += end_time - start_time;
-++#else
-+     if (p[6]==2)
-+       on_time_deblock += end_time - start_time;
-+     else
-+       on_time += end_time - start_time;
-++#endif
-+     count++;
-+     if ((count&0x7f)==0)
-++#ifdef RPI_COMBINE_JOBS
-+       printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-++#else
-++      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+ #endif
-++job_done_early:
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+-- 
-+2.5.0
-+
-+
-+From 8289de8799cb666404d8d1a01c211a7be17bae61 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 1 Jul 2015 12:53:10 +0100
-+Subject: [PATCH 66/68] Added chroma deblocking
-+
-+---
-+ libavcodec/hevc.c               |  20 ++
-+ libavcodec/hevc.h               |  12 +-
-+ libavcodec/hevc_filter.c        |  92 +++++-
-+ libavcodec/rpi_hevc_transform.h | 644 +++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s | 207 +++++++++++++
-+ libavcodec/rpi_qpu.c            |  27 +-
-+ libavcodec/rpi_shader.qasm      |  11 +
-+ 7 files changed, 988 insertions(+), 25 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index da4bebb..d56f777 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -249,6 +249,14 @@ static void pic_arrays_free(HEVCContext *s)
-+       gpu_free(&s->y_setup_ptr);
-+       s->y_setup_arm = 0;
-+     }
-++    if (s->uv_setup_arm) {
-++      gpu_free(&s->uv_setup_ptr);
-++      s->uv_setup_arm = 0;
-++    }
-++    if (s->vpu_cmds_arm) {
-++      gpu_free(&s->vpu_cmds_ptr);
-++      s->vpu_cmds_arm = 0;
-++    }
-+ #endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+@@ -322,6 +330,18 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-+     memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-+     printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-++
-++    s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
-++    s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
-++    gpu_malloc_uncached(sizeof(*s->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height, &s->uv_setup_ptr); // TODO make this cached
-++    s->uv_setup_arm = (void*)s->uv_setup_ptr.arm;
-++    s->uv_setup_vc = (void*)s->uv_setup_ptr.vc;
-++    memset(s->uv_setup_arm, 0, s->uv_setup_ptr.numbytes);
-++    printf("Setup uv %d by %d by %d\n",s->uv_setup_width,s->uv_setup_height,sizeof(*s->uv_setup_arm));
-++
-++    gpu_malloc_uncached(sizeof(*s->vpu_cmds_arm) * 3,&s->vpu_cmds_ptr);
-++    s->vpu_cmds_arm = (void*) s->vpu_cmds_ptr.arm;
-++    s->vpu_cmds_vc = s->vpu_cmds_ptr.vc;
-+ #endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index c933757..6675a4f 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -55,7 +55,7 @@
-+   #define RPI_MAX_JOBS 2
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+-
-++  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
-+   #define RPI_DEBLOCK_VPU
-+ 
-+ #endif
-+@@ -1006,6 +1006,16 @@ typedef struct HEVCContext {
-+     uint8_t (*y_setup_vc)[2][2][2][4];
-+     int setup_width; // Number of 16x16 blocks across the image
-+     int setup_height; // Number of 16x16 blocks down the image
-++
-++    GPU_MEM_PTR_T uv_setup_ptr;
-++    uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
-++    uint8_t (*uv_setup_vc)[2][2][2][4];
-++    int uv_setup_width;
-++    int uv_setup_height;
-++
-++    GPU_MEM_PTR_T vpu_cmds_ptr;
-++    int (*vpu_cmds_arm)[6]; // r0-r5 for each command
-++    int vpu_cmds_vc;
-+ #endif
-+ 
-+ #endif
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 06371da..6367068 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -656,9 +656,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                                    s->frame->linesize[chroma],
-+                                                                    c_tc, no_p, no_q);
-+                         } else
-++#ifdef RPI_DEBLOCK_VPU
-++                        if (s->enable_rpi_deblock) {
-++                            uint8_t (*setup)[2][2][4];
-++                            int xc = x>>s->ps.sps->hshift[chroma];
-++                            int yc = y>>s->ps.sps->vshift[chroma];
-++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-++                            int a = ((yc>>3) & 1) << 1;
-++                            int b = (xc>>3) & 1;
-++                            setup = s->uv_setup_arm[num16];
-++                            setup[0][b][0][a] = c_tc[0];
-++                            setup[0][b][0][a + 1] = c_tc[1];
-++                        } else
-++#endif
-+                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
-+                                                                  s->frame->linesize[chroma],
-+                                                                  c_tc, no_p, no_q);
-++
-+                     }
-+                 }
-+ 
-+@@ -689,6 +703,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                                    s->frame->linesize[chroma],
-+                                                                    c_tc, no_p, no_q);
-+                         } else
-++#ifdef RPI_DEBLOCK_VPU
-++                        if (s->enable_rpi_deblock) {
-++                            uint8_t (*setup)[2][2][4];
-++                            int xc = x>>s->ps.sps->hshift[chroma];
-++                            int yc = y>>s->ps.sps->vshift[chroma];
-++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-++                            int a = ((xc>>3) & 1) << 1;
-++                            int b = (yc>>3) & 1;
-++                            setup = s->uv_setup_arm[num16];
-++                            setup[1][b][0][a] = c_tc[0];
-++                            setup[1][b][0][a + 1] = c_tc[1];
-++                        } else
-++#endif
-+                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
-+                                                                  s->frame->linesize[chroma],
-+                                                                  c_tc, no_p, no_q);
-+@@ -1013,33 +1040,56 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
-+ {
-+   // Flush image, 4 lines above to bottom of ctb stripe
-+-  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
-++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
-+   // TODO flush buffer of beta/tc setup when it becomes cached
-++
-++  // Prepare three commands at once to avoid calling overhead
-++  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
-++  s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
-++  s->vpu_cmds_arm[0][2] = s->setup_width;
-++  s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
-++  s->vpu_cmds_arm[0][4] = ctb_size>>4;
-++  s->vpu_cmds_arm[0][5] = 2;
-++
-++  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
-++  s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
-++  s->vpu_cmds_arm[1][2] = s->uv_setup_width;
-++  s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-++  s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-++  s->vpu_cmds_arm[1][5] = 3;
-++
-++  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
-++  s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
-++  s->vpu_cmds_arm[2][2] = s->uv_setup_width;
-++  s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-++  s->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-++  s->vpu_cmds_arm[2][5] = 4;
-++
-+   // Call VPU
-+-  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
-+-  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
-+-                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
-+-                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
-++  vpu_wait(vpu_post_code( vpu_get_fn(), s->vpu_cmds_vc, 3, 0, 0, 0, 5, 0)); // 5 means to do all the commands
-+ }
-+ 
-+-static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
-+-{
-+-   int y2;
-+-   for(y2=y;y2<y+ctb_size;y2+=16) {
-+-      rpi_deblock(s,y2,16);
-+-   }
-+-}
-+ #endif
-+ 
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-++#ifdef RPI_DEBLOCK_VPU
-++    int done_deblock = 0;
-++#endif
-+     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
-+         deblocking_filter_CTB(s, x, y);
-+ #ifdef RPI_DEBLOCK_VPU
-+     if (s->enable_rpi_deblock && x_end)
-+     {
-+-      rpi_deblock(s, y, ctb_size);
-++      int y_at_end = y >= s->ps.sps->height - ctb_size;
-++      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
-++      int y_start = y&~63;
-++      if (y_at_end) height = s->ps.sps->height - y_start;
-++      if ((((y+ctb_size)&63)==0) || y_at_end) {
-++        done_deblock = 1;
-++        rpi_deblock(s, y_start, height);
-++      }
-+     }
-+ #endif
-+     if (s->ps.sps->sao_enabled) {
-+@@ -1070,11 +1120,25 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-++#ifdef RPI_DEBLOCK_VPU
-++        if (s->enable_rpi_deblock) {
-++          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-++          if (done_deblock) {
-++            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++          }
-++        } else {
-++#ifdef RPI_INTER_QPU
-++          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-++#endif
-++          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++        }
-++#else
-+ #ifdef RPI_INTER_QPU
-+         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+-        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-++        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++#endif
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index b3f155f..4309f1c 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,14 +3,32 @@ unsigned char rpi_hevc_transform [] = {
-+ 106,
-+ 0,
-+ 144,
-+-38,
-++47,
-+ 1,
-+ 37,
-+ 106,
-+ 0,
-+ 144,
-+-57,
-++66,
-+ 1,
-++53,
-++106,
-++0,
-++144,
-++192,
-++4,
-++69,
-++106,
-++0,
-++144,
-++192,
-++4,
-++85,
-++106,
-++0,
-++144,
-++220,
-++5,
-+ 169,
-+ 3,
-+ 62,
-+@@ -2427,4 +2445,626 @@ unsigned char rpi_hevc_transform [] = {
-+ 128,
-+ 90,
-+ 0,
-++169,
-++3,
-++14,
-++96,
-++4,
-++31,
-++169,
-++3,
-++30,
-++96,
-++1,
-++31,
-++73,
-++64,
-++52,
-++64,
-++45,
-++64,
-++2,
-++64,
-++10,
-++64,
-++64,
-++198,
-++1,
-++7,
-++8,
-++232,
-++63,
-++0,
-++0,
-++0,
-++6,
-++232,
-++253,
-++255,
-++255,
-++255,
-++0,
-++246,
-++0,
-++0,
-++0,
-++4,
-++215,
-++64,
-++3,
-++96,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++30,
-++106,
-++132,
-++24,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++143,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++135,
-++0,
-++131,
-++102,
-++0,
-++158,
-++71,
-++0,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++30,
-++106,
-++132,
-++24,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++112,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++104,
-++0,
-++131,
-++102,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++30,
-++106,
-++134,
-++24,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++123,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++112,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++3,
-++99,
-++131,
-++71,
-++68,
-++232,
-++32,
-++0,
-++0,
-++0,
-++0,
-++99,
-++2,
-++99,
-++23,
-++102,
-++7,
-++106,
-++127,
-++156,
-++178,
-++255,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++30,
-++106,
-++134,
-++24,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++72,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++61,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++25,
-++102,
-++9,
-++106,
-++2,
-++30,
-++41,
-++3,
-++26,
-++87,
-++162,
-++64,
-++64,
-++198,
-++1,
-++23,
-++127,
-++158,
-++95,
-++255,
-++239,
-++3,
-++0,
-++254,
-++128,
-++143,
-++94,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++95,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++208,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++209,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++47,
-++0,
-++8,
-++255,
-++227,
-++23,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++52,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++239,
-++3,
-++0,
-++254,
-++128,
-++143,
-++14,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++15,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++16,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++17,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++13,
-++0,
-++8,
-++255,
-++227,
-++3,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++4,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++32,
-++246,
-++192,
-++11,
-++1,
-++16,
-++32,
-++246,
-++2,
-++140,
-++47,
-++240,
-++32,
-++247,
-++35,
-++141,
-++63,
-++178,
-++64,
-++254,
-++35,
-++141,
-++2,
-++68,
-++192,
-++243,
-++128,
-++11,
-++32,
-++255,
-++35,
-++240,
-++58,
-++226,
-++192,
-++243,
-++188,
-++10,
-++0,
-++254,
-++0,
-++141,
-++4,
-++4,
-++0,
-++240,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++240,
-++246,
-++35,
-++141,
-++48,
-++66,
-++0,
-++247,
-++227,
-++143,
-++52,
-++242,
-++32,
-++247,
-++227,
-++142,
-++52,
-++178,
-++90,
-++0,
-++161,
-++3,
-++6,
-++64,
-++23,
-++64,
-++96,
-++8,
-++70,
-++98,
-++97,
-++8,
-++70,
-++98,
-++98,
-++8,
-++70,
-++98,
-++99,
-++8,
-++70,
-++98,
-++100,
-++8,
-++70,
-++98,
-++101,
-++8,
-++70,
-++98,
-++255,
-++159,
-++8,
-++250,
-++23,
-++102,
-++7,
-++106,
-++112,
-++30,
-++33,
-++3,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index b055208..5543093 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -85,6 +85,13 @@ hevc_trans_16x16:
-+   beq memclear16
-+   cmp r5,2
-+   beq hevc_deblock_16x16
-++  cmp r5,3
-++  beq hevc_uv_deblock_16x16
-++  cmp r5,4
-++  beq hevc_uv_deblock_16x16_with_clear
-++  cmp r5,5
-++  beq hevc_run_command_list
-++
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -708,3 +715,203 @@ normal_filtering:
-+ 
-+ filtering_done:
-+   b lr
-++
-++
-++hevc_uv_deblock_16x16:
-++  push r6-r15, lr
-++  mov r14,0
-++  b hevc_uv_start
-++hevc_uv_deblock_16x16_with_clear:
-++  push r6-r15, lr
-++  mov r14,1
-++  b hevc_uv_start
-++
-++hevc_uv_start:
-++  mov r9,r4
-++  mov r4,r3
-++  mov r13,r2
-++  mov r2,r0
-++  mov r10,r0
-++  subscale4 r0,r1
-++  mov r8,63
-++  mov r6,-3
-++  vmov H(zeros,0),0
-++# r7 is number of blocks still to load
-++# r0 is location of current block - 4 * stride
-++# r1 is stride
-++# r2 is location of current block
-++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
-++# r4 is setup
-++# r5 is for temporary calculations
-++# r8 holds 63
-++# r6 holds -3
-++# r9 holds the number of 16 high rows to process
-++# r10 holds the original img base
-++# r11 returns 0 if no filtering was done on the edge
-++# r12 saves a copy of this
-++# r13 is copy of width
-++# r14 is 1 if we should clear the old contents, or 0 if not
-++
-++uv_process_row:
-++  # First iteration does not do horizontal filtering on previous
-++  mov r7, r13
-++  mov r3,0
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
-++  cmp r14,1
-++  bne uv_skip0
-++  vstb H(zeros,0),(r4)
-++uv_skip0:
-++  bl uv_vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
-++  bl uv_vert_filter
-++  sub r3,8
-++  b uv_start_deblock_loop
-++uv_deblock_loop:
-++  # Middle iterations do vertical on current block and horizontal on preceding
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)
-++  cmp r14,1
-++  bne uv_skip1
-++  vstb H(zeros,0),(r4)
-++uv_skip1:
-++  bl uv_vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_vert_filter
-++  sub r3,8
-++  vldb H(setup_input,0), -16(r4)
-++  cmp r14,1
-++  bne uv_skip3
-++  vstb H(zeros,0),-16(r4)
-++uv_skip3:
-++  bl uv_horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_horz_filter
-++  sub r3,8*64
-++  addcmpbeq r12,0,0,uv_skip_save_top
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++uv_skip_save_top:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++uv_start_deblock_loop:
-++  # move onto next 16x16 (could do this with circular buffer support instead)
-++  add r3,16
-++  and r3,r8
-++  add r4,32
-++  # Perform loop counter operations (may work with an addcmpbgt as well?)
-++  add r0,16
-++  add r2,16
-++  sub r7,1
-++  cmp r7,0 # Are there still more blocks to load
-++  bgt uv_deblock_loop
-++
-++  # Final iteration needs to just do horizontal filtering
-++  vldb H(setup_input,0), -16(r4)
-++  cmp r14,1
-++  bne uv_skip2
-++  vstb H(zeros,0),-16(r4)
-++uv_skip2:
-++  bl uv_horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_horz_filter
-++  sub r3,64*8
-++  addcmpbeq r12,0,0,uv_skip_save_top2
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++uv_skip_save_top2:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++
-++# Now look to see if we should do another row
-++  sub r9,1
-++  cmp r9,0
-++  bgt uv_start_again
-++  pop r6-r15, pc
-++uv_start_again:
-++  # Need to sort out r0,r2 to point to next row down
-++  addscale16 r10,r1
-++  mov r2,r10
-++  subscale4 r0,r2,r1
-++  b uv_process_row
-++
-++
-++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
-++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
-++
-++uv_vert_filter:
-++  push lr
-++
-++  vmov HX(P1,0), V(16,14)+r3
-++  vmov HX(P0,0), V(16,15)+r3
-++  vmov HX(Q0,0), V(16,16)+r3
-++  vmov HX(Q1,0), V(16,17)+r3
-++
-++  bl do_chroma_filter
-++
-++  vadds V(16,15)+r3, HX(P0,0), 0
-++  vadds V(16,16)+r3, HX(Q0,0), 0
-++
-++  pop pc
-++
-++# Filter edge at H(16,0)+r3
-++uv_horz_filter:
-++  push lr
-++
-++  vmov HX(P1,0), H(14,0)+r3
-++  vmov HX(P0,0), H(15,0)+r3
-++  vmov HX(Q0,0), H(16,0)+r3
-++  vmov HX(Q1,0), H(17,0)+r3
-++
-++  bl do_chroma_filter
-++
-++  vadds H(15,0)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds H(16,0)+r3, HX(Q0,0), 0
-++
-++  pop pc
-++
-++# r4 points to array of beta/tc for each 4 length edge
-++do_chroma_filter:
-++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
-++  valtl HX(tc,0),H(setup,0),H(setup,0)
-++
-++  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
-++  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
-++  vsub -,HX(P1,0),HX(Q1,0) SACC
-++  vmov HX(delta,0),4 SACC
-++  vasr HX(delta,0),HX(delta,0),3
-++  vclamps HX(delta,0), HX(delta,0), HX(tc,0)
-++  vadd HX(P0,0),HX(P0,0),HX(delta,0)
-++  vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
-++  b lr
-++
-++# r0 = list
-++# r1 = number
-++hevc_run_command_list:
-++  push r6-r7, lr
-++  mov r6, r0
-++  mov r7, r1
-++loop_cmds:
-++  ld r0,(r6) # How to encode r6++?
-++  add r6,4
-++  ld r1,(r6)
-++  add r6,4
-++  ld r2,(r6)
-++  add r6,4
-++  ld r3,(r6)
-++  add r6,4
-++  ld r4,(r6)
-++  add r6,4
-++  ld r5,(r6)
-++  add r6,4
-++  bl hevc_trans_16x16
-++  sub r7,1
-++  cmp r7,0
-++  bgt loop_cmds
-++
-++  pop r6-r7, pc
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 91777be..5aa0432 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -397,6 +397,8 @@ static void *vpu_start(void *arg) {
-+   int start_time;
-+   int end_time;
-+   int count=0;
-++  int count_deblock=0;
-++  int count_qpu=0;
-+ #endif
-+   while(1) {
-+     int i;
-+@@ -442,7 +444,7 @@ static void *vpu_start(void *arg) {
-+         break;
-+       }
-+     }
-+-    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-++    //printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-+ #endif
-+     qpu_code = p[7];
-+     qpu_codeb = p[16];
-+@@ -460,6 +462,12 @@ static void *vpu_start(void *arg) {
-+     off_time += start_time-last_time;
-+ #endif
-+ 
-++#define NO_FLUSH 1
-++#define CLEAR_PROFILE 2
-++#define OUTPUT_COUNTS 4
-++
-++#define FLAGS_FOR_PROFILING (NO_FLUSH)
-++
-+ #ifdef RPI_COMBINE_JOBS
-+     if (have_qpu) {
-+       for(i=0;i<8;i++) {
-+@@ -472,14 +480,14 @@ static void *vpu_start(void *arg) {
-+       }
-+       if (have_vpu) {
-+         execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-+         q[0] = 0;
-+       } else {
-+         execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+@@ -510,7 +518,7 @@ static void *vpu_start(void *arg) {
-+       execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
-+ #else
-+       execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING , 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+@@ -525,17 +533,20 @@ static void *vpu_start(void *arg) {
-+     // There are three cases we may wish to distinguish of VPU/QPU activity
-+     on_time += end_time - start_time;
-+ #else
-+-    if (p[6]==2)
-++    if (p[6]>1) {
-++      count_deblock++;
-+       on_time_deblock += end_time - start_time;
-+-    else
-++    } else {
-+       on_time += end_time - start_time;
-++      count_qpu++;
-++    }
-+ #endif
-+     count++;
-+     if ((count&0x7f)==0)
-+ #ifdef RPI_COMBINE_JOBS
-+-      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-+-#else
-+       printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#else
-++      printf("Posted %d On=%dms (%d calls), On_deblock=%dms (%d calls), Off=%dms\n",count,(int)(on_time/1000),count_qpu,(int)(on_time_deblock/1000),count_deblock,(int)(off_time/1000));
-+ #endif
-+ #endif
-+ job_done_early:
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 0686249..64bf5b0 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -1077,6 +1077,17 @@ nop        ; nop # delay slot 2
-+ ::mc_interrupt_exit12
-+   mov  -, vw_wait # wait on the VDW
-+ 
-++  # Dummy wait to test instructions
-++#  mov r3,1000000
-++#:dummy_loop
-++#  sub.setf r3, r3, 1
-++#  nop
-++#  nop
-++#  brr.anynn -, r:dummy_loop
-++#  nop
-++#  nop
-++#  nop
-++
-+   ldtmu0
-+   ldtmu0
-+   ldtmu1
-+-- 
-+2.5.0
-+
-+
-+From f7a8b294a317dc015d19a342aa20a994a299676d Mon Sep 17 00:00:00 2001
-+From: Ben Avison <bavison@riscosopen.org>
-+Date: Tue, 23 Jun 2015 23:42:03 +0100
-+Subject: [PATCH 67/68] armv7/hevc: Optimise deblocking boundary strength
-+ calculation
-+
-+---
-+ libavcodec/arm/hevcdsp_deblock_neon.S | 115 +++++++++++++++++
-+ libavcodec/arm/hevcdsp_init_neon.c    |   9 ++
-+ libavcodec/hevc.h                     |  11 --
-+ libavcodec/hevc_filter.c              | 224 ++++++++++++++--------------------
-+ libavcodec/hevcdsp.c                  | 116 ++++++++++++++++++
-+ libavcodec/hevcdsp.h                  |  14 +++
-+ 6 files changed, 344 insertions(+), 145 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-+index 166bddb..bad4589 100644
-+--- a/libavcodec/arm/hevcdsp_deblock_neon.S
-++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-+@@ -383,3 +383,118 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
-+         vst1.8   {d4}, [r0]
-+         bx       lr
-+ endfunc
-++
-++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
-++ */
-++function ff_hevc_deblocking_boundary_strengths_neon, export=1
-++        add         ip, sp, #4*4
-++        push        {a2-a4,v1-v8,lr}
-++        ldmia       ip, {v5-v7}
-++1:      ldmdb       ip, {v1-v4}
-++        ldrsb       a3, [v5, #8]    @ curr->ref_idx
-++        ldrsb       v8, [v5, #9]
-++        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
-++        ldrsb       lr, [v6, #9]
-++        ldr         v1, [v1, a3, lsl #2]
-++        ldrb        a3, [v5, #10]   @ curr->pred_flag
-++        ldr         v2, [v2, v8, lsl #2]
-++        ldrb        v8, [v6, #10]   @ neigh->pred_flag
-++        ldr         v3, [v3, ip, lsl #2]
-++        ldr         v4, [v4, lr, lsl #2]
-++        teq         a3, #3
-++        beq         20f
-++        teq         v8, #3
-++        beq         90f
-++
-++        tst         a3, #1
-++        ldrne       a3, [v5, #0]    @ curr->mv[0]
-++        ldreq       a3, [v5, #4]    @ curr->mv[1]
-++        moveq       v1, v2
-++        tst         v8, #1
-++        ldrne       v8, [v6, #0]    @ neigh->mv[0]
-++        ldreq       v8, [v6, #4]    @ neigh->mv[1]
-++        moveq       v3, v4
-++        teq         v1, v3
-++        bne         10f
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v8, a3
-++        ssub16      a3, a3, v8
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        @ drop through
-++10:     movne       a3, #1
-++11:     subs        a2, a2, #1
-++12:     strbhs      a3, [v7], a4
-++        subs        a2, a2, #1
-++        bhs         12b
-++
-++        ldm         sp, {a2, a3}
-++        add         ip, sp, #16*4
-++        subs        a1, a1, #1
-++        add         v5, v5, a3
-++        add         v6, v6, a3
-++        bhi         1b
-++        pop         {a2-a4,v1-v8,pc}
-++
-++20:     teq         v8, #3
-++        bne         10b
-++
-++        teq         v1, v3
-++        teqeq       v2, v4
-++        bne         40f
-++        teq         v1, v2
-++        bne         30f
-++
-++        ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v3, v1
-++        ssub16      a3, v1, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         25f
-++        ssub16      ip, v4, v2
-++        ssub16      a3, v2, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        beq         11b
-++        @ drop through
-++25:     ssub16      ip, v4, v1
-++        ssub16      a3, v1, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         10b
-++        ssub16      ip, v3, v2
-++        ssub16      a3, v2, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        b           10b
-++
-++30:     ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v3, v1
-++        ssub16      a3, v1, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         10b
-++        ssub16      ip, v4, v2
-++        ssub16      a3, v2, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        b           10b
-++
-++40:     teq         v1, v4
-++        teqeq       v2, v3
-++        bne         10b
-++
-++        ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        b           25b
-++
-++90:     mov         a3, #1
-++        b           11b
-++endfunc
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index e5da7e9..49c70dd 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -290,6 +290,10 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+ }
-+ #undef CMP
-+ 
-++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-++                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                                MvField *curr, MvField *neigh, uint8_t *bs);
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -387,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
-+         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
-+     }
-++
-++    assert(offsetof(MvField, mv) == 0);
-++    assert(offsetof(MvField, ref_idx) == 8);
-++    assert(offsetof(MvField, pred_flag) == 10);
-++    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
-+ }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 6675a4f..10fbccc 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -683,17 +683,6 @@ typedef struct CodingUnit {
-+     uint8_t cu_transquant_bypass_flag;
-+ } CodingUnit;
-+ 
-+-typedef struct Mv {
-+-    int16_t x;  ///< horizontal component of motion vector
-+-    int16_t y;  ///< vertical component of motion vector
-+-} Mv;
-+-
-+-typedef struct MvField {
-+-    DECLARE_ALIGNED(4, Mv, mv)[2];
-+-    int8_t ref_idx[2];
-+-    int8_t pred_flag;
-+-} MvField;
-+-
-+ typedef struct NeighbourAvailable {
-+     int cand_bottom_left;
-+     int cand_left;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 6367068..826a82f 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -726,69 +726,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+     }
-+ }
-+ 
-+-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
-+-                             RefPicList *neigh_refPicList)
-+-{
-+-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-+-        // same L0 and L1
-+-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-+-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-+-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-+-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-+-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-+-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-+-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-+-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-+-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else {
-+-            return 1;
-+-        }
-+-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+-        Mv A, B;
-+-        int ref_A, ref_B;
-+-
-+-        if (curr->pred_flag & 1) {
-+-            A     = curr->mv[0];
-+-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-+-        } else {
-+-            A     = curr->mv[1];
-+-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-+-        }
-+-
-+-        if (neigh->pred_flag & 1) {
-+-            B     = neigh->mv[0];
-+-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-+-        } else {
-+-            B     = neigh->mv[1];
-+-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-+-        }
-+-
-+-        if (ref_A == ref_B) {
-+-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else
-+-            return 1;
-+-    }
-+-
-+-    return 1;
-+-}
-+ 
-+ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+                                            int log2_trafo_size)
-+@@ -799,10 +736,17 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
-+     int min_pu_width     = s->ps.sps->min_pu_width;
-+     int min_tu_width     = s->ps.sps->min_tb_width;
-+-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-+-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
-+     int boundary_upper, boundary_left;
-+-    int i, j, bs;
-++    int i, j;
-++    RefPicList *rpl      = s->ref->refPicList;
-++    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
-++    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
-++    int y_pu             = y0 >> log2_min_pu_size;
-++    int x_pu             = x0 >> log2_min_pu_size;
-++    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
-++    int is_intra         = curr->pred_flag == PF_INTRA;
-++    int inc              = log2_min_pu_size == 2 ? 2 : 1;
-++    uint8_t *bs;
-+ 
-+ #ifdef DISABLE_STRENGTHS
-+     return;
-+@@ -818,34 +762,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-+         boundary_upper = 0;
-+ 
-++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
-++
-+     if (boundary_upper) {
-+         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
-+                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
-+-                              s->ref->refPicList;
-+-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
-+-        int yq_pu =  y0      >> log2_min_pu_size;
-+-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
-+-        int yq_tu =  y0      >> log2_min_tu_size;
-++                              rpl;
-++        MvField *top = curr - min_pu_width;
-++
-++        if (is_intra) {
-++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
-++                bs[i >> 2] = 2;
-++
-++        } else {
-++            int y_tu = y0 >> log2_min_tu_size;
-++            int x_tu = x0 >> log2_min_tu_size;
-++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-++                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
-++                    curr, top, bs);
-+ 
-+             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int x_pu = (x0 + i) >> log2_min_pu_size;
-+-                int x_tu = (x0 + i) >> log2_min_tu_size;
-+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-+-                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-+-                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-+-
-+-                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
-+-                    bs = 2;
-+-                else if (curr_cbf_luma || top_cbf_luma)
-+-                    bs = 1;
-+-                else
-+-                    bs = boundary_strength(s, curr, top, rpl_top);
-+-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-++                int i_pu = i >> log2_min_pu_size;
-++                int i_tu = i >> log2_min_tu_size;
-++
-++                if (top[i_pu].pred_flag == PF_INTRA)
-++                    bs[i >> 2] = 2;
-++                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
-++                    bs[i >> 2] = 1;
-+             }
-++        }
-++    }
-++
-++    if (!is_intra) {
-++        for (j = inc; j < trafo_in_min_pus; j += inc) {
-++            MvField *top;
-++
-++            curr += min_pu_width * inc;
-++            top = curr - min_pu_width;
-++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-++                    curr, top, bs);
-++        }
-+     }
-+ 
-+-    // bs for vertical TU boundaries
-+     boundary_left = x0 > 0 && !(x0 & 7);
-+     if (boundary_left &&
-+         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-+@@ -856,64 +822,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-+         boundary_left = 0;
-+ 
-++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
-++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
-++
-+     if (boundary_left) {
-+         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
-+                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
-+-                               s->ref->refPicList;
-+-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
-+-        int xq_pu =  x0      >> log2_min_pu_size;
-+-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
-+-        int xq_tu =  x0      >> log2_min_tu_size;
-+-
-+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int y_pu      = (y0 + i) >> log2_min_pu_size;
-+-                int y_tu      = (y0 + i) >> log2_min_tu_size;
-+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-+-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-+-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-+-
-+-                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
-+-                    bs = 2;
-+-                else if (curr_cbf_luma || left_cbf_luma)
-+-                    bs = 1;
-+-                else
-+-                    bs = boundary_strength(s, curr, left, rpl_left);
-+-                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
-+-            }
-+-    }
-++                               rpl;
-++        MvField *left = curr - 1;
-+ 
-+-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
-+-        RefPicList *rpl = s->ref->refPicList;
-++        if (is_intra) {
-++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
-++                bs[j * s->bs_width >> 2] = 2;
-+ 
-+-        // bs for TU internal horizontal PU boundaries
-+-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-+-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-+-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-+-
-+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int x_pu = (x0 + i) >> log2_min_pu_size;
-+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-+-
-+-                bs = boundary_strength(s, curr, top, rpl);
-+-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-++        } else {
-++            int y_tu = y0 >> log2_min_tu_size;
-++            int x_tu = x0 >> log2_min_tu_size;
-++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-++                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
-++                    curr, left, bs);
-++
-++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-++                int j_pu = j >> log2_min_pu_size;
-++                int j_tu = j >> log2_min_tu_size;
-++
-++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
-++                    bs[j * s->bs_width >> 2] = 2;
-++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
-++                    bs[j * s->bs_width >> 2] = 1;
-+             }
-+         }
-++    }
-+ 
-+-        // bs for TU internal vertical PU boundaries
-+-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-+-            int y_pu = (y0 + j) >> log2_min_pu_size;
-++    if (!is_intra) {
-++        for (i = inc; i < trafo_in_min_pus; i += inc) {
-++            MvField *left;
-+ 
-+-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
-+-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
-+-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
-+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-++            curr += inc;
-++            left = curr - 1;
-++            bs += inc << log2_min_pu_size >> 2;
-+ 
-+-                bs = boundary_strength(s, curr, left, rpl);
-+-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-+-            }
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-++                    curr, left, bs);
-+         }
-+     }
-+ }
-+diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
-+index 9d773d9..a6534a9 100644
-+--- a/libavcodec/hevcdsp.c
-++++ b/libavcodec/hevcdsp.c
-+@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
-+ #include "hevcdsp_template.c"
-+ #undef BIT_DEPTH
-+ 
-++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
-++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                               MvField *curr, MvField *neigh, uint8_t *bs)
-++{
-++    for (; pus > 0; pus--) {
-++        int strength, out;
-++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
-++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
-++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
-++
-++#if 1 // This more directly matches the original implementation
-++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-++            // same L0 and L1
-++            if (curr_refL0 == neigh_refL0 &&
-++                curr_refL0 == curr_refL1 &&
-++                neigh_refL0 == neigh_refL1) {
-++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else if (neigh_refL0 == curr_refL0 &&
-++                       neigh_refL1 == curr_refL1) {
-++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else if (neigh_refL1 == curr_refL0 &&
-++                       neigh_refL0 == curr_refL1) {
-++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else {
-++                strength = 1;
-++            }
-++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-++            Mv curr_mv0, neigh_mv0;
-++
-++            if (curr->pred_flag & 1) {
-++                curr_mv0   = curr->mv[0];
-++            } else {
-++                curr_mv0   = curr->mv[1];
-++                curr_refL0 = curr_refL1;
-++            }
-++
-++            if (neigh->pred_flag & 1) {
-++                neigh_mv0   = neigh->mv[0];
-++            } else {
-++                neigh_mv0   = neigh->mv[1];
-++                neigh_refL0 = neigh_refL1;
-++            }
-++
-++            if (curr_refL0 == neigh_refL0) {
-++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else
-++                strength = 1;
-++        } else
-++            strength = 1;
-++#else // This has exactly the same effect, but is more suitable for vectorisation
-++        Mv curr_mv[2];
-++        Mv neigh_mv[2];
-++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
-++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
-++
-++        if (!(curr->pred_flag & 2)) {
-++            curr_mv[1] = curr_mv[0];
-++            curr_refL1 = curr_refL0;
-++        }
-++        if (!(neigh->pred_flag & 2)) {
-++            neigh_mv[1] = neigh_mv[0];
-++            neigh_refL1 = neigh_refL0;
-++        }
-++        if (!(curr->pred_flag & 1)) {
-++            curr_mv[0] = curr_mv[1];
-++            curr_refL0 = curr_refL1;
-++        }
-++        if (!(neigh->pred_flag & 1)) {
-++            neigh_mv[0] = neigh_mv[1];
-++            neigh_refL0 = neigh_refL1;
-++        }
-++
-++        strength = 1;
-++
-++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-++                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
-++                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
-++
-++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-++                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
-++                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
-++
-++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
-++#endif
-++
-++        curr += in_inc / sizeof (MvField);
-++        neigh += in_inc / sizeof (MvField);
-++
-++        for (out = dup; out > 0; out--)
-++        {
-++            *bs = strength;
-++            bs += out_inc;
-++        }
-++    }
-++}
-++
-+ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
-+ {
-+ #undef FUNC
-+@@ -257,6 +371,8 @@ int i = 0;
-+         break;
-+     }
-+ 
-++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-++
-+     if (ARCH_X86)
-+         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
-+     if (ARCH_ARM)
-+diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
-+index 9f1f6dd..e221e54 100644
-+--- a/libavcodec/hevcdsp.h
-++++ b/libavcodec/hevcdsp.h
-+@@ -42,6 +42,17 @@ typedef struct SAOParams {
-+     uint8_t type_idx[3];    ///< sao_type_idx
-+ } SAOParams;
-+ 
-++typedef struct Mv {
-++    int16_t x;  ///< horizontal component of motion vector
-++    int16_t y;  ///< vertical component of motion vector
-++} Mv;
-++
-++typedef struct MvField {
-++    DECLARE_ALIGNED(4, Mv, mv)[2];
-++    int8_t ref_idx[2];
-++    int8_t pred_flag;
-++} MvField;
-++
-+ typedef struct HEVCDSPContext {
-+     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+                     struct GetBitContext *gb, int pcm_bit_depth);
-+@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
-+     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                         int32_t *tc, uint8_t *no_p,
-+                                         uint8_t *no_q);
-++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
-++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                               MvField *curr, MvField *neigh, uint8_t *bs);
-+ } HEVCDSPContext;
-+ 
-+ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-+-- 
-+2.5.0
-+
-+
-+From 95c6d1107c1dc60fd40abeb9eadb69b3937ce9f5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 15 Jul 2015 09:09:11 +0100
-+Subject: [PATCH 68/68] Only enable qpu when needed
-+
-+---
-+ libavcodec/hevc.h    |  2 +-
-+ libavcodec/rpi_qpu.c | 21 ++++++++++++++++-----
-+ 2 files changed, 17 insertions(+), 6 deletions(-)
-+
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 10fbccc..a8ff7b8 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -56,7 +56,7 @@
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+   // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
-+-  #define RPI_DEBLOCK_VPU
-++  //#define RPI_DEBLOCK_VPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 5aa0432..ffd13ca 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -9,7 +9,7 @@
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ // Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
-+-#define RPI_COMBINE_JOBS
-++//#define RPI_COMBINE_JOBS
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -143,9 +143,9 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   volatile struct GPU* ptr;
-+ 	if (mb < 0)
-+ 		return -1;
-+-
-++#ifndef RPI_ASYNC
-+ 	if (qpu_enable(mb, 1)) return -2;
-+-
-++#endif
-+   vcsm_init();
-+   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+@@ -336,9 +336,9 @@ static void gpu_term(void)
-+     vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-+     pthread_join(vpu_thread, &res);
-+   }
-+-#endif
-+-
-++#else
-+   qpu_enable(mb, 0);
-++#endif
-+   gpu_free_internal(&gpu_mem_ptr);
-+ 
-+   vcsm_exit();
-+@@ -400,6 +400,7 @@ static void *vpu_start(void *arg) {
-+   int count_deblock=0;
-+   int count_qpu=0;
-+ #endif
-++  int qpu_started = 0;
-+   while(1) {
-+     int i;
-+     int *p; // Pointer for a QPU/VPU job
-+@@ -427,6 +428,12 @@ static void *vpu_start(void *arg) {
-+     if (p[7] == 0 && p[0] == 0 && p[16]==0)
-+       goto job_done_early;
-+ 
-++    if (!qpu_started) {
-++      int result = qpu_enable(gpu->mb, 1);
-++      av_assert0(result==0);
-++      qpu_started = 1;
-++    }
-++
-+ #ifdef RPI_COMBINE_JOBS
-+     // First scan for a qpu job
-+     for (int x=0;x<num_jobs;x++) {
-+@@ -556,6 +563,10 @@ job_done_early:
-+     pthread_mutex_unlock(&post_mutex);
-+   }
-+ 
-++  if (qpu_started) {
-++    qpu_enable(gpu->mb, 0);
-++  }
-++
-+   return NULL;
-+ }
-+ 
-+-- 
-+2.5.0
-+
-
-From aa8268363b74f1b9ed6d6801d379bc08a85eead2 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 14 Dec 2015 12:35:14 +0000
-Subject: [PATCH 63/93] [build] Add patches to ffmpeg for native build
-
----
- tools/depends/target/ffmpeg/autobuild.sh | 8 ++++++++
- 1 file changed, 8 insertions(+)
-
-diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
-index b9bfd57..f6d4c3b 100755
---- a/tools/depends/target/ffmpeg/autobuild.sh
-+++ b/tools/depends/target/ffmpeg/autobuild.sh
-@@ -125,6 +125,14 @@ mkdir ffmpeg-${VERSION}
- cd ffmpeg-${VERSION} || exit 2
- tar --strip-components=1 -xf ../${ARCHIVE}
- 
-+patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
-+patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
-+patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+patch -p1 < ../pfcd_hevc_optimisations.patch
-+patch -p1 < ../add_h264_MVC_support.patch
-+
- CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
- ./configure --prefix=$FFMPEG_PREFIX \
- 	--extra-version="kodi-${VERSION}" \
-
-From 31e2cf35741edf914f5413668c158186f9310197 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 64/93] ffmpeg: Add cabac opimisations for hevc
-
----
- .../0001-Squashed-commit-of-the-following.patch    | 2288 ++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |    5 +-
- 2 files changed, 2292 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch b/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-new file mode 100644
-index 0000000..adb584b
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-@@ -0,0 +1,2288 @@
-+From 9421229d7f8e6ef6cfb8a1b30f731f91c6586aca Mon Sep 17 00:00:00 2001
-+From: John Cox <jc@kynesim.co.uk>
-+Date: Wed, 13 Jan 2016 16:13:33 +0000
-+Subject: [PATCH] H.265 residual decode rework (v2)
-+
-+Rework the cabac decode functions
-+Simplify the code flow and variable usage where possible
-+
-+(Remove profiling and other spurious deltas that were in v1)
-+---
-+ libavcodec/arm/cabac.h                |  155 ++++-
-+ libavcodec/arm/hevc_cabac.h           |  491 +++++++++++++++
-+ libavcodec/arm/hevcdsp_deblock_neon.S |   13 +-
-+ libavcodec/arm/hevcdsp_epel_neon.S    |    9 +-
-+ libavcodec/cabac.c                    |   11 +-
-+ libavcodec/cabac.h                    |    9 +-
-+ libavcodec/cabac_functions.h          |   15 +-
-+ libavcodec/hevc_cabac.c               | 1098 +++++++++++++++++++++++++--------
-+ 8 files changed, 1534 insertions(+), 267 deletions(-)
-+ create mode 100644 libavcodec/arm/hevc_cabac.h
-+
-+diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-+index fdbf86b..0a3980a 100644
-+--- a/libavcodec/arm/cabac.h
-++++ b/libavcodec/arm/cabac.h
-+@@ -26,13 +26,34 @@
-+ #include "libavutil/internal.h"
-+ #include "libavcodec/cabac.h"
-+ 
-++
-++#if UNCHECKED_BITSTREAM_READER
-++#define LOAD_16BITS_BEHI\
-++        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#elif CONFIG_THUMB
-++#define LOAD_16BITS_BEHI\
-++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-++        "it         cs                                          \n\t"\
-++        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#else
-++#define LOAD_16BITS_BEHI\
-++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-++        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#endif
-++
-++
-+ #define get_cabac_inline get_cabac_inline_arm
-+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-+                                                  uint8_t *const state)
-+ {
-+     int bit;
-++#if 0
-+     void *reg_b, *reg_c, *tmp;
-+-
-+     __asm__ volatile(
-+         "ldrb       %[bit]        , [%[state]]                  \n\t"
-+         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
-+@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-+           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
-+         : "memory", "cc"
-+         );
-++#else
-++   // *** Not thumb compatible yet
-++   unsigned int reg_b, tmp;
-++    __asm__ (
-++        "ldrb       %[bit]        , [%[state]]                  \n\t"
-++        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-++        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
-++// %bit = *state
-++// %range = range
-++// %tmp = RangeLPS
-++        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "ittt       ge                                          \n\t"
-++        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "mvnge      %[bit]        , %[bit]                      \n\t"
-++        "movge      %[range]      , %[tmp]                      \n\t"
-++
-++        "clz        %[tmp]        , %[range]                    \n\t"
-++        "sub        %[tmp]        , #23                         \n\t"
-++
-++        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++        "strb       %[r_b]        , [%[state]]                  \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++
-++        "bne        2f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
-++        "movw       %[r_b]        , #0xFFFF                     \n\t"
-++        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++
-++        "rbit       %[r_b]        , %[low]                      \n\t"
-++        "clz        %[r_b]        , %[r_b]                      \n\t"
-++        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++#if CONFIG_THUMB
-++        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++        "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++        "2:                                                     \n\t"
-++        :    [bit]"=&r"(bit),
-++             [low]"+&r"(c->low),
-++           [range]"+&r"(c->range),
-++             [r_b]"=&r"(reg_b),
-++             [ptr]"+&r"(c->bytestream),
-++             [tmp]"=&r"(tmp)
-++          :  [state]"r"(state),
-++            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++              [byte]"M"(offsetof(CABACContext, bytestream)),
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++        : "memory", "cc"
-++        );
-++#endif
-+ 
-+     return bit & 1;
-+ }
-++
-++#define get_cabac_bypass get_cabac_bypass_arm
-++static inline int get_cabac_bypass_arm(CABACContext * const c)
-++{
-++    int rv = 0;
-++    unsigned int tmp;
-++    __asm (
-++        "lsl        %[low]        , #1                          \n\t"
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "adc        %[rv]         , %[rv]       , #0            \n\t"
-++        "it         cs                                          \n\t"
-++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++        "bne        1f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-++        "movw       %[tmp]        , #0xFFFF                     \n\t"
-++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "1:                                                     \n\t"
-++        : // Outputs
-++              [rv]"+&r"(rv),
-++             [low]"+&r"(c->low),
-++             [tmp]"=&r"(tmp),
-++             [ptr]"+&r"(c->bytestream)
-++        : // Inputs
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++             [range]"r"(c->range)
-++        : "cc"
-++    );
-++    return rv;
-++}
-++
-++
-++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
-++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
-++{
-++    unsigned int tmp;
-++    __asm (
-++        "lsl        %[low]        , #1                          \n\t"
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "ite        cc                                          \n\t"
-++        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
-++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++        "bne        1f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-++        "movw       %[tmp]        , #0xFFFF                     \n\t"
-++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "1:                                                     \n\t"
-++        : // Outputs
-++              [rv]"+&r"(rv),
-++             [low]"+&r"(c->low),
-++             [tmp]"=&r"(tmp),
-++             [ptr]"+&r"(c->bytestream)
-++        : // Inputs
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++             [range]"r"(c->range)
-++        : "cc"
-++    );
-++    return rv;
-++}
-++
-+ #endif /* HAVE_ARMV6T2_INLINE */
-+ 
-+ #endif /* AVCODEC_ARM_CABAC_H */
-+diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
-+new file mode 100644
-+index 0000000..31d3c59
-+--- /dev/null
-++++ b/libavcodec/arm/hevc_cabac.h
-+@@ -0,0 +1,491 @@
-++/*
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#ifndef AVCODEC_ARM_HEVC_CABAC_H
-++#define AVCODEC_ARM_HEVC_CABAC_H
-++
-++#include "config.h"
-++#if HAVE_ARMV6T2_INLINE
-++
-++#define hevc_mem_bits32 hevc_mem_bits32_arm
-++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
-++{
-++    unsigned int n;
-++    __asm__ (
-++        "rev        %[n], %[x]                     \n\t"
-++        : [n]"=r"(n)
-++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
-++        :
-++        );
-++    return n << (bits & 7);
-++}
-++
-++
-++// ---------------------------------------------------------------------------
-++//
-++// Helper fns - little bits of code where ARM has an instraction that the
-++// compiler doesn't know about / use
-++
-++#define trans_scale_sat trans_scale_sat_arm
-++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-++{
-++    int rv;
-++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
-++
-++    __asm__ (
-++    "ssat %[rv], #16, %[t], ASR #1 \n\t"
-++    : [rv]"=r"(rv)
-++    : [t]"r"(t)
-++    :
-++    );
-++    return rv;
-++}
-++
-++#define update_rice update_rice_arm
-++static inline void update_rice_arm(uint8_t * const stat_coeff,
-++    const unsigned int last_coeff_abs_level_remaining,
-++    const unsigned int c_rice_param)
-++{
-++    int t;
-++    __asm__ (
-++    "lsl   %[t], %[coeff], #1               \n\t"
-++    "lsrs  %[t], %[t], %[shift]             \n\t"
-++    "it    eq                               \n\t"
-++    "subeq %[stat], %[stat], #1             \n\t"
-++    "cmp   %[t], #6                         \n\t"
-++    "adc   %[stat], %[stat], #0             \n\t"
-++    "usat  %[stat], #8, %[stat]             \n\t"
-++    : [stat]"+&r"(*stat_coeff),
-++         [t]"=&r"(t)
-++    :  [coeff]"r"(last_coeff_abs_level_remaining),
-++       [shift]"r"(c_rice_param)
-++    : "cc"
-++    );
-++}
-++
-++// ---------------------------------------------------------------------------
-++//
-++// CABAC get loops
-++//
-++// Where the loop is simple enough we can normally do 10-30% better than the
-++// compiler
-++
-++// Get the residual greater than 1 bits
-++
-++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
-++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
-++    uint8_t * const state0)
-++{
-++    unsigned int i, reg_b, st, tmp, bit, rv;
-++     __asm__ (
-++         "mov        %[i]          , #0                          \n\t"
-++         "mov        %[rv]         , #0                          \n\t"
-++         "1:                                                     \n\t"
-++         "add        %[i]          , %[i]        , #1            \n\t"
-++         "cmp        %[rv]         , #0                          \n\t"
-++         "ite        eq                                          \n\t"
-++         "usateq     %[st]         , #2          , %[i]          \n\t"
-++         "movne      %[st]         , #0                          \n\t"
-++
-++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
-++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "cmp        %[low]        , %[range], lsl #17           \n\t"
-++         "ittt       ge                                          \n\t"
-++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++         "mvnge      %[bit]        , %[bit]                      \n\t"
-++         "movge      %[range]      , %[tmp]                      \n\t"
-++
-++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++         "and        %[bit]        , %[bit]      , #1            \n\t"
-++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
-++
-++         "clz        %[tmp]        , %[range]                    \n\t"
-++         "sub        %[tmp]        , #23                         \n\t"
-++
-++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-++// There is a small speed gain from combining both conditions, using a single
-++// branch and then working out what that meant later
-++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++         "it         ne                                          \n\t"
-++         "cmpne      %[n]          , %[i]                        \n\t"
-++         "bne        1b                                          \n\t"
-++
-++// If reload is not required then we must have run out of flags to decode
-++         "tst        %[tmp]        , %[tmp]                      \n\t"
-++         "bne        2f                                          \n\t"
-++
-++// Do reload
-++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-++         "movw       %[r_b]        , #0xFFFF                     \n\t"
-++         "rev        %[tmp]        , %[tmp]                      \n\t"
-++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-++
-++         "rbit       %[r_b]        , %[low]                      \n\t"
-++         "clz        %[r_b]        , %[r_b]                      \n\t"
-++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++
-++#if CONFIG_THUMB
-++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++
-++         "cmp        %[n]          , %[i]                        \n\t"
-++         "bne        1b                                          \n\t"
-++         "2:                                                     \n\t"
-++         :    [bit]"=&r"(bit),
-++              [low]"+&r"(c->low),
-++            [range]"+&r"(c->range),
-++              [r_b]"=&r"(reg_b),
-++             [bptr]"+&r"(c->bytestream),
-++                [i]"=&r"(i),
-++              [tmp]"=&r"(tmp),
-++               [st]"=&r"(st),
-++               [rv]"=&r"(rv)
-++          :  [state0]"r"(state0),
-++                  [n]"r"(n),
-++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++               [byte]"M"(offsetof(CABACContext, bytestream)),
-++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++         : "memory", "cc"
-++    );
-++    return rv;
-++}
-++
-++
-++// n must be > 0 on entry
-++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * p)
-++{
-++    unsigned int reg_b, tmp, st, bit;
-++     __asm__ (
-++         "1:                                                     \n\t"
-++// Get bin from map
-++         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
-++
-++// Load state & ranges
-++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
-++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
-++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "cmp        %[low]        , %[range], lsl #17           \n\t"
-++         "ittt       ge                                          \n\t"
-++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++         "mvnge      %[bit]        , %[bit]                      \n\t"
-++         "movge      %[range]      , %[tmp]                      \n\t"
-++
-++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++         "tst        %[bit]        , #1                          \n\t"
-++// GCC asm seems to need strbne written differently for thumb and arm
-++#if CONFIG_THUMB
-++         "it         ne                                          \n\t"
-++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
-++#else
-++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
-++#endif
-++
-++// Renorm
-++         "clz        %[tmp]        , %[range]                    \n\t"
-++         "sub        %[tmp]        , #23                         \n\t"
-++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-++// There is a small speed gain from combining both conditions, using a single
-++// branch and then working out what that meant later
-++         "subs       %[n]          , %[n]        , #1            \n\t"
-++#if CONFIG_THUMB
-++         "itt        ne                                          \n\t"
-++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        1b                                          \n\t"
-++#else
-++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        1b                                          \n\t"
-++#endif
-++
-++// If we have bits left then n must be 0 so give up now
-++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        2f                                          \n\t"
-++
-++// Do reload
-++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-++         "movw       %[r_b]        , #0xFFFF                     \n\t"
-++         "rev        %[tmp]        , %[tmp]                      \n\t"
-++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-++
-++         "rbit       %[r_b]        , %[low]                      \n\t"
-++         "clz        %[r_b]        , %[r_b]                      \n\t"
-++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++
-++#if CONFIG_THUMB
-++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++
-++// Check to see if we still have more to do
-++         "cmp        %[n]          , #0                          \n\t"
-++         "bne        1b                                          \n\t"
-++         "2:                                                     \n\t"
-++         :    [bit]"=&r"(bit),
-++              [low]"+&r"(c->low),
-++            [range]"+&r"(c->range),
-++              [r_b]"=&r"(reg_b),
-++             [bptr]"+&r"(c->bytestream),
-++              [idx]"+&r"(p),
-++                [n]"+&r"(n),
-++              [tmp]"=&r"(tmp),
-++               [st]"=&r"(st)
-++          :  [state0]"r"(state0),
-++            [ctx_map]"r"(ctx_map),
-++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++               [byte]"M"(offsetof(CABACContext, bytestream)),
-++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++         : "memory", "cc"
-++    );
-++
-++    return p;
-++}
-++
-++// ---------------------------------------------------------------------------
-++//
-++// CABAC_BY22 functions
-++//
-++// By and large these are (at best) no faster than their C equivalents - the
-++// only one worth having is _peek where we do a slightly better job than the
-++// compiler
-++//
-++// The others have been stashed here for reference in case larger scale asm
-++// is attempted in which case they might be a useful base
-++
-++
-++#define get_cabac_by22_peek get_cabac_by22_peek_arm
-++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
-++{
-++    uint32_t rv, tmp;
-++    __asm__ (
-++        "bic      %[rv]  , %[low], #1            \n\t"
-++        "cmp      %[inv] , #0                    \n\t"
-++        "it       ne                             \n\t"
-++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
-++        :  // Outputs
-++             [rv]"=&r"(rv),
-++             [tmp]"=r"(tmp)
-++        :  // Inputs
-++             [low]"r"(c->low),
-++             [inv]"r"(c->range)
-++        :  // Clobbers
-++                "cc"
-++    );
-++    return rv << 1;
-++}
-++
-++#if 0
-++
-++// ***** Slower than the C  :-(
-++#define get_cabac_by22_flush get_cabac_by22_flush_arm
-++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
-++{
-++    uint32_t m, tmp;
-++    __asm__ (
-++    "add    %[bits], %[bits], %[n]   \n\t"
-++    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
-++
-++    "rsb    %[tmp], %[n], #32        \n\t"
-++    "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++    "mul    %[tmp], %[range], %[tmp] \n\t"
-++
-++    "rev    %[m], %[m]               \n\t"
-++
-++    "lsl    %[tmp], %[tmp], #23      \n\t"
-++    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++    "and    %[tmp], %[bits], #7         \n\t"
-++    "lsl    %[m], %[m], %[tmp]          \n\t"
-++
-++    "orr    %[low], %[low], %[m], lsr #9      \n\t"
-++        :  // Outputs
-++             [m]"=&r"(m),
-++           [tmp]"=&r"(tmp),
-++          [bits]"+&r"(c->by22.bits),
-++           [low]"+&r"(c->low)
-++        :  // Inputs
-++               [n]"r"(n),
-++             [val]"r"(val),
-++             [inv]"r"(c->range),
-++           [range]"r"(c->by22.range),
-++             [ptr]"r"(c->bytestream)
-++        :  // Clobbers
-++    );
-++}
-++
-++
-++// Works but slower than C
-++#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
-++static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
-++{
-++    uint32_t n, val, tmp, level;
-++
-++//    PROFILE_START();
-++
-++    __asm__ (
-++            // Peek
-++            "bic    %[val],  %[low],   #1  \n\t"
-++            "cmp    %[inv], #0          \n\t"
-++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-++            "lsl    %[val], %[val], #1  \n\t"
-++
-++            // Count bits (n = prefix)
-++            "mvn    %[n], %[val] \n\t"
-++            "clz    %[n], %[n]   \n\t"
-++
-++            "lsl    %[level], %[val], %[n] \n\t"
-++            "subs   %[tmp], %[n], #3 \n\t"
-++            "blo    2f \n\t"
-++
-++            // prefix >= 3
-++            // < tmp = prefix - 3
-++            // > tmp = prefix + rice - 3
-++            "add    %[tmp], %[tmp], %[rice] \n\t"
-++            // > n = prefix * 2 + rice - 3
-++            "add    %[n], %[tmp], %[n] \n\t"
-++            "cmp    %[n], #21 \n\t"
-++            "bhi    3f \n\t"
-++
-++            "orr    %[level], %[level], #0x80000000 \n\t"
-++            "rsb    %[tmp], %[tmp], #31 \n\t"
-++            "lsr    %[level], %[level], %[tmp] \n\t"
-++
-++            "mov    %[tmp], #2 \n\t"
-++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-++            "b      1f \n\t"
-++
-++            // > 22 bits used in total - need reload
-++            "3:  \n\t"
-++
-++            // Stash prefix + rice - 3 in level (only spare reg)
-++            "mov    %[level], %[tmp] \n\t"
-++            // Restore n to flush value (prefix)
-++            "sub    %[n], %[n], %[tmp] \n\t"
-++
-++            // Flush + reload
-++
-++//          "rsb    %[tmp], %[n], #32        \n\t"
-++//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++//          "mul    %[tmp], %[range], %[tmp] \n\t"
-++
-++            // As it happens we know that all the bits we are flushing are 1
-++            // so we can cheat slightly
-++            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
-++            "lsl    %[tmp], %[tmp], #23      \n\t"
-++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++            "add    %[bits], %[bits], %[n]   \n\t"
-++            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
-++            "rev    %[n], %[n]               \n\t"
-++            "and    %[tmp], %[bits], #7         \n\t"
-++            "lsl    %[n], %[n], %[tmp]          \n\t"
-++
-++            "orr    %[low], %[low], %[n], lsr #9      \n\t"
-++
-++            // (reload)
-++
-++            "bic    %[val],  %[low],   #1  \n\t"
-++            "cmp    %[inv], #0          \n\t"
-++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-++            "lsl    %[val], %[val], #1  \n\t"
-++
-++            // Build value
-++
-++            "mov    %[n], %[level] \n\t"
-++
-++            "orr     %[tmp], %[val], #0x80000000 \n\t"
-++            "rsb     %[level], %[level], #31 \n\t"
-++            "lsr     %[level], %[tmp], %[level] \n\t"
-++
-++            "mov    %[tmp], #2 \n\t"
-++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-++            "b      1f \n\t"
-++
-++            // prefix < 3
-++            "2:  \n\t"
-++            "rsb    %[tmp], %[rice], #31 \n\t"
-++            "lsr    %[level], %[level], %[tmp] \n\t"
-++            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
-++            "add    %[n], %[n], %[rice] \n\t"
-++
-++            "1:  \n\t"
-++            // Flush
-++            "add    %[n], %[n], #1 \n\t"
-++
-++            "rsb    %[tmp], %[n], #32        \n\t"
-++            "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++
-++            "add    %[bits], %[bits], %[n]   \n\t"
-++            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
-++
-++            "mul    %[tmp], %[range], %[tmp] \n\t"
-++            "lsl    %[tmp], %[tmp], #23      \n\t"
-++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++            "rev    %[val], %[val]               \n\t"
-++            "and    %[tmp], %[bits], #7         \n\t"
-++            "lsl    %[val], %[val], %[tmp]          \n\t"
-++
-++            "orr    %[low], %[low], %[val], lsr #9      \n\t"
-++        :  // Outputs
-++         [level]"=&r"(level),
-++             [n]"=&r"(n),
-++           [val]"=&r"(val),
-++           [tmp]"=&r"(tmp),
-++          [bits]"+&r"(c->by22.bits),
-++           [low]"+&r"(c->low)
-++        :  // Inputs
-++            [rice]"r"(c_rice_param),
-++             [inv]"r"(c->range),
-++           [range]"r"(c->by22.range),
-++             [ptr]"r"(c->bytestream)
-++        :  // Clobbers
-++                "cc"
-++    );
-++
-++//    PROFILE_ACC(residual_abs);
-++
-++    return level;
-++}
-++#endif
-++
-++#endif /* HAVE_ARMV6T2_INLINE */
-++
-++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
-+diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-+index bad4589..a088cc3 100644
-+--- a/libavcodec/arm/hevcdsp_deblock_neon.S
-++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-+@@ -409,10 +409,12 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         beq         90f
-+ 
-+         tst         a3, #1
-++        itee        ne
-+         ldrne       a3, [v5, #0]    @ curr->mv[0]
-+         ldreq       a3, [v5, #4]    @ curr->mv[1]
-+         moveq       v1, v2
-+         tst         v8, #1
-++        itee        ne
-+         ldrne       v8, [v6, #0]    @ neigh->mv[0]
-+         ldreq       v8, [v6, #4]    @ neigh->mv[1]
-+         moveq       v3, v4
-+@@ -424,9 +426,14 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         sel         a3, a3, ip
-+         ands        a3, a3, lr
-+         @ drop through
-+-10:     movne       a3, #1
-++10:     it          ne
-++        movne       a3, #1
-+ 11:     subs        a2, a2, #1
-+-12:     strbhs      a3, [v7], a4
-++12:
-++A       strbhs      a3, [v7], a4
-++T       itt         hs
-++T       strbhs      a3, [v7]
-++T       addhs       v7, v7, a4
-+         subs        a2, a2, #1
-+         bhs         12b
-+ 
-+@@ -442,6 +449,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         bne         10b
-+ 
-+         teq         v1, v3
-++        it          eq
-+         teqeq       v2, v4
-+         bne         40f
-+         teq         v1, v2
-+@@ -487,6 +495,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         b           10b
-+ 
-+ 40:     teq         v1, v4
-++        ite         eq
-+         teqeq       v2, v3
-+         bne         10b
-+ 
-+diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-+index 516ae5b..00eab9e 100644
-+--- a/libavcodec/arm/hevcdsp_epel_neon.S
-++++ b/libavcodec/arm/hevcdsp_epel_neon.S
-+@@ -110,7 +110,9 @@ function ff_hevc_put_epel_h_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++@ adr reaches if we are in thumb mode but not in arm
-++T       adr    r12, epel_coeffs
-++A       adrl   r12, epel_coeffs
-+         add    r7, r12
-+         sub       r1, #1
-+         lsl       r4, #1
-+@@ -170,7 +172,8 @@ function ff_hevc_put_epel_v_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++T       adr    r12, epel_coeffs
-++A       adrl   r12, epel_coeffs
-+         add    r7, r12
-+         load_coeffs_16b r7
-+         sub       r1, r2
-+@@ -246,7 +249,7 @@ function ff_hevc_put_epel_hv_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++        adr    r12, epel_coeffs
-+         sub    r6, #1
-+         lsl    r6, #2
-+         add    r6, r12 // mx epel coeff offset
-+diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
-+index f298336..91f5ef5 100644
-+--- a/libavcodec/cabac.c
-++++ b/libavcodec/cabac.c
-+@@ -59,10 +59,19 @@ int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
-+ #if CABAC_BITS == 16
-+     c->low =  (*c->bytestream++)<<18;
-+     c->low+=  (*c->bytestream++)<<10;
-++    // Keep our fetches on a 2-byte boundry as this should avoid ever having to
-++    // do unaligned loads if the compiler (or asm) optimises the double byte
-++    // load into a single instruction
-++    if(((uintptr_t)c->bytestream & 1) == 0) {
-++        c->low += (1 << 9);
-++    }
-++    else {
-++        c->low += ((*c->bytestream++) << 2) + 2;
-++    }
-+ #else
-+     c->low =  (*c->bytestream++)<<10;
-+-#endif
-+     c->low+= ((*c->bytestream++)<<2) + 2;
-++#endif
-+     c->range= 0x1FE;
-+     if ((c->range<<(CABAC_BITS+1)) < c->low)
-+         return AVERROR_INVALIDDATA;
-+diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
-+index 857211c..857a1de 100644
-+--- a/libavcodec/cabac.h
-++++ b/libavcodec/cabac.h
-+@@ -48,7 +48,14 @@ extern CABAC_TABLE_CONST uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
-+ typedef struct CABACContext{
-+     int low;
-+     int range;
-+-    int outstanding_count;
-++    union
-++    {
-++        int outstanding_count;
-++        struct {
-++            uint16_t bits;
-++            uint16_t range;
-++        } by22;
-++    };
-+     const uint8_t *bytestream_start;
-+     const uint8_t *bytestream;
-+     const uint8_t *bytestream_end;
-+diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
-+index 2d1d2a6..d3518cb 100644
-+--- a/libavcodec/cabac_functions.h
-++++ b/libavcodec/cabac_functions.h
-+@@ -51,6 +51,7 @@ static CABAC_TABLE_CONST uint8_t * const ff_h264_lps_range = ff_h264_cabac_table
-+ static CABAC_TABLE_CONST uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
-+ static CABAC_TABLE_CONST uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
-+ 
-++#if !defined(get_cabac_bypass) || !defined(get_cabac_terminate)
-+ static void refill(CABACContext *c){
-+ #if CABAC_BITS == 16
-+         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-+@@ -63,7 +64,9 @@ static void refill(CABACContext *c){
-+ #endif
-+         c->bytestream += CABAC_BITS / 8;
-+ }
-++#endif
-+ 
-++#ifndef get_cabac_terminate
-+ static inline void renorm_cabac_decoder_once(CABACContext *c){
-+     int shift= (uint32_t)(c->range - 0x100)>>31;
-+     c->range<<= shift;
-+@@ -71,14 +74,18 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
-+     if(!(c->low & CABAC_MASK))
-+         refill(c);
-+ }
-++#endif
-+ 
-+ #ifndef get_cabac_inline
-+ static void refill2(CABACContext *c){
-+     int i;
-+     unsigned x;
-+-
-++#if !HAVE_FAST_CLZ
-+     x= c->low ^ (c->low-1);
-+     i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
-++#else
-++    i = ff_ctz(c->low) - CABAC_BITS;
-++#endif
-+ 
-+     x= -CABAC_MASK;
-+ 
-+@@ -94,7 +101,9 @@ static void refill2(CABACContext *c){
-+ #endif
-+         c->bytestream += CABAC_BITS/8;
-+ }
-++#endif
-+ 
-++#ifndef get_cabac_inline
-+ static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
-+     int s = *state;
-+     int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
-+@@ -166,6 +175,7 @@ static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
-+  *
-+  * @return the number of bytes read or 0 if no end
-+  */
-++#ifndef get_cabac_terminate
-+ static int av_unused get_cabac_terminate(CABACContext *c){
-+     c->range -= 2;
-+     if(c->low < c->range<<(CABAC_BITS+1)){
-+@@ -175,11 +185,13 @@ static int av_unused get_cabac_terminate(CABACContext *c){
-+         return c->bytestream - c->bytestream_start;
-+     }
-+ }
-++#endif
-+ 
-+ /**
-+  * Skip @p n bytes and reset the decoder.
-+  * @return the address of the first skipped byte or NULL if there's less than @p n bytes left
-+  */
-++#ifndef skip_bytes
-+ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
-+     const uint8_t *ptr = c->bytestream;
-+ 
-+@@ -196,5 +208,6 @@ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
-+ 
-+     return ptr;
-+ }
-++#endif
-+ 
-+ #endif /* AVCODEC_CABAC_FUNCTIONS_H */
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 271e17a..4caf720 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -21,14 +21,72 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++#define UNCHECKED_BITSTREAM_READER 1
-++
-+ #include "libavutil/attributes.h"
-+ #include "libavutil/common.h"
-+ 
-+-#include "cabac_functions.h"
-+ #include "hevc.h"
-++#include "cabac_functions.h"
-++
-++// BY22 is probably faster than simple bypass if the processor has
-++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-++// x86 has fast int divide
-++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-++// Use native divide if we have a fast one - otherwise use mpy 1/x
-++// x86 has a fast integer divide - arm doesn't - unsure about other
-++// architectures
-++#define USE_BY22_DIV  ARCH_X86
-++
-++// Special case blocks with a single significant ceoff
-++// Decreases the complexity of the code for a common case but increases the
-++// code size.
-++#define USE_N_END_1 1
-++
-++#if ARCH_ARM
-++#include "arm/hevc_cabac.h"
-++#endif
-+ 
-+ #define CABAC_MAX_BIN 31
-+ 
-++
-++#if USE_BY22 && !USE_BY22_DIV
-++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
-++
-++static const uint32_t cabac_by22_inv_range[256] = {
-++                                                    0,      I(257), I(258), I(259),
-++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
-++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
-++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
-++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
-++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
-++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
-++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
-++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
-++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
-++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
-++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
-++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
-++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
-++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
-++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
-++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
-++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
-++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
-++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
-++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
-++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
-++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
-++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
-++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
-++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
-++    I(510), I(511)
-++};
-++#undef I
-++#endif  // USE_BY22
-++
-+ /**
-+  * number of bin by SyntaxElement.
-+  */
-+@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
-+     { 28, 36, 43, 49, 54, 58, 61, 63, },
-+ };
-+ 
-++
-++typedef struct
-++{
-++    uint16_t coeff;
-++    uint16_t scale;
-++} xy_off_t;
-++
-++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
-++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
-++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
-++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
-++
-++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
-++
-++#define OFF_DIAG(t) {\
-++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
-++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
-++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
-++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
-++}
-++
-++#define OFF_HORIZ(t) {\
-++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
-++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
-++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
-++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
-++}
-++
-++#define OFF_VERT(t) {\
-++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
-++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
-++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
-++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
-++}
-++
-++static const xy_off_t off_xys[3][4][16] =
-++{
-++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
-++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
-++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
-++};
-++
-++
-++// Helper fns
-++#ifndef hevc_mem_bits32
-++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
-++{
-++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
-++}
-++#endif
-++
-++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
-++#define hevc_clz32 hevc_clz32_builtin
-++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
-++{
-++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
-++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
-++}
-++#endif
-++
-++// It is unlikely that we will ever need this but include for completeness
-++#ifndef hevc_clz32
-++static inline unsigned int hevc_clz32(unsigned int x)
-++{
-++    unsigned int n = 1;
-++    if ((x & 0xffff0000) == 0) {
-++        n += 16;
-++        x <<= 16;
-++    }
-++    if ((x & 0xff000000) == 0) {
-++        n += 8;
-++        x <<= 8;
-++    }
-++    if ((x & 0xf0000000) == 0) {
-++        n += 4;
-++        x <<= 4;
-++    }
-++    if ((x & 0xc0000000) == 0) {
-++        n += 2;
-++        x <<= 2;
-++    }
-++    return n - ((x >> 31) & 1);
-++}
-++#endif
-++
-++
-++#if !USE_BY22
-++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
-++// will no longer be called but the setup calls will still exist and we want
-++// to null them out
-++#define bypass_start(s)
-++#define bypass_finish(s)
-++#else
-++// Use BY22 for residual bypass block
-++
-++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
-++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
-++
-++// BY22 notes that bypass is simply a divide into the bitstream and so we
-++// can peek out large quantities of bits at one and treat the result as if
-++// it was VLC.  In many cases this will lead to O(1) processing rather than
-++// O(n) though the setup and teardown is sufficiently expensive that it is
-++// only worth using if we expect to be dealing with more than a few bits
-++// The definition of "a few bits" will vary from platform to platform but
-++// tests on ARM show that it probably isn't worth it for a single coded
-++// residual, but is for >1 - this is probaly reinforced that if there are
-++// more residuals then they are likely to be bigger and this will make the
-++// O(1) nature of the code more worthwhile.
-++
-++
-++#if !USE_BY22_DIV
-++// * 1/x @ 32 bits gets us 22 bits of accuracy
-++#define CABAC_BY22_PEEK_BITS  22
-++#else
-++// A real 32-bit divide gets us another bit
-++// If we have a 64 bit int & a unit time divider then we should get a lot
-++// of bits (55)  but that is untested and it is unclear if it would give
-++// us a large advantage
-++#define CABAC_BY22_PEEK_BITS  23
-++#endif
-++
-++// Bypass block start
-++// Must be called before _by22_peek is used as it sets the CABAC environment
-++// into the correct state.  _by22_finish must be called to return to 'normal'
-++// (i.e. non-bypass) cabac decoding
-++static inline void get_cabac_by22_start(CABACContext * const c)
-++{
-++    const unsigned int bits = __builtin_ctz(c->low);
-++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
-++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
-++#if !USE_BY22_DIV
-++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
-++#endif
-++
-++    c->bytestream -= (CABAC_BITS / 8);
-++    c->by22.bits = bits;
-++#if !USE_BY22_DIV
-++    c->by22.range = c->range;
-++    c->range = inv;
-++#endif
-++    c->low = x;
-++}
-++
-++// Bypass block finish
-++// Must be called at the end of the bypass block to return to normal operation
-++static inline void get_cabac_by22_finish(CABACContext * const c)
-++{
-++    unsigned int used = c->by22.bits;
-++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
-++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
-++
-++    c->bytestream += bytes_used + (CABAC_BITS / 8);
-++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
-++#if !USE_BY22_DIV
-++    c->range = c->by22.range;
-++#endif
-++}
-++
-++// Peek bypass bits
-++// _by22_start must be called before _by22_peek is called and _by22_flush
-++// must be called afterwards to flush any used bits
-++// The actual number of valid bits returned is
-++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
-++// will be at least 22 which should be long enough for any prefix or suffix
-++// though probably not long enough for the worst case combination
-++#ifndef get_cabac_by22_peek
-++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
-++{
-++#if USE_BY22_DIV
-++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
-++#else
-++    uint32_t x = c->low & ~1U;
-++    const uint32_t inv = c->range;
-++
-++    if (inv != 0)
-++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
-++
-++    return x << 1;
-++#endif
-++}
-++#endif
-++
-++// Flush bypass bits peeked by _by22_peek
-++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
-++// val is an unmodified copy of whatever _by22_peek returned
-++#ifndef get_cabac_by22_flush
-++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
-++{
-++    // Subtract the bits used & reshift up to the top of the word
-++#if USE_BY22_DIV
-++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
-++#else
-++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
-++#endif
-++
-++    // and refill lower bits
-++    // We will probably OR over some existing bits but that doesn't matter
-++    c->by22.bits += n;
-++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
-++}
-++#endif
-++
-++#endif  // USE_BY22
-++
-++
-+ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
-+ {
-+     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-+@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
-+     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
-+ }
-+ 
-+-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
-++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
-+ }
-+ 
-+-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
-++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
-+ }
-+ 
-+-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
-++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
-+ }
-+ 
-+ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
-+@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
-+     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
-+ }
-+ 
-+-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
-++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
-+                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
-+ {
-+     int i = 0;
-+     int max = (log2_size << 1) - 1;
-+     int ctx_offset, ctx_shift;
-+ 
-+-    if (!c_idx) {
-++    if (!c_idx_nz) {
-+         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
-+         ctx_shift = (log2_size + 1) >> 2;
-+     } else {
-+@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
-+     return value;
-+ }
-+ 
-+-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
-++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
-+ {
-+     int inc;
-+ 
-+-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
-++    inc = (ctx_cg != 0) + (c_idx_nz << 1);
-+ 
-+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
-+ }
-+-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
-+-                                           int offset, const uint8_t *ctx_idx_map)
-+-{
-+-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
-+-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-+-}
-+ 
-+-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
-++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
-+ {
-+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
-+ }
-+@@ -966,65 +1223,305 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
-+     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
-+ }
-+ 
-+-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
-++
-++#if !USE_BY22
-++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
-++#endif
-++
-++
-++#ifndef coeff_abs_level_remaining_decode_bypass
-++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint32_t y;
-++    unsigned int prefix;
-++    unsigned int last_coeff_abs_level_remaining;
-++    unsigned int n;
-++
-++    y = get_cabac_by22_peek(c);
-++    prefix = hevc_clz32(~y);
-++    // y << prefix will always have top bit 0
-++
-++    if (prefix < 3) {
-++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
-++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
-++        n = prefix + 1 + rice_param;
-++    }
-++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
-++    {
-++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
-++
-++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-++        n = prefix * 2 + rice_param - 2;
-++    }
-++    else {
-++        unsigned int suffix;
-++
-++        get_cabac_by22_flush(c, prefix, y);
-++        y = get_cabac_by22_peek(c);
-++
-++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
-++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-++        n = prefix + rice_param - 2;
-++    }
-++
-++    get_cabac_by22_flush(c, n, y);
-++
-++    return last_coeff_abs_level_remaining;
-++}
-++#endif
-++
-++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
-+ {
-++    CABACContext * const c = &s->HEVClc->cc;
-+     int prefix = 0;
-+     int suffix = 0;
-+     int last_coeff_abs_level_remaining;
-+     int i;
-+ 
-+-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
-++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
-+         prefix++;
-+     if (prefix == CABAC_MAX_BIN) {
-+         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
-+         return 0;
-+     }
-++
-+     if (prefix < 3) {
-+         for (i = 0; i < rc_rice_param; i++)
-+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-++            suffix = (suffix << 1) | get_cabac_bypass(c);
-+         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
-+     } else {
-+         int prefix_minus3 = prefix - 3;
-+         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-++            suffix = (suffix << 1) | get_cabac_bypass(c);
-+         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
-+                                               << rc_rice_param) + suffix;
-+     }
-++
-+     return last_coeff_abs_level_remaining;
-+ }
-+ 
-+-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
-++#if !USE_BY22
-++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
-++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
-+ {
-+-    int i;
-+-    int ret = 0;
-++    CABACContext * const c = &s->HEVClc->cc;
-++    unsigned int i;
-++    uint32_t ret = 0;
-+ 
-+     for (i = 0; i < nb; i++)
-+-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
-+-    return ret;
-++        ret = (ret << 1) | get_cabac_bypass(c);
-++
-++    return ret << (32 - nb);
-+ }
-++#endif
-++
-++#ifndef coeff_sign_flag_decode_bypass
-++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint32_t y;
-++    y = get_cabac_by22_peek(c);
-++    get_cabac_by22_flush(c, nb, y);
-++    return y & ~(0xffffffffU >> nb);
-++}
-++#endif
-++
-++
-++#ifndef get_cabac_greater1_bits
-++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
-++    uint8_t * const state0)
-++{
-++    unsigned int i;
-++    unsigned int rv = 0;
-++    for (i = 0; i != n; ++i) {
-++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
-++        const unsigned int b = get_cabac(c, state0 + idx);
-++        rv = (rv << 1) | b;
-++    }
-++    return rv;
-++}
-++#endif
-++
-++
-++// N.B. levels returned are the values assuming coeff_abs_level_remaining
-++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
-++// this version of events.
-++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
-++    int * const pprev_subset_coded, int * const psum,
-++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
-++    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
-++    unsigned int rv;
-++    unsigned int i;
-++    const unsigned int n = FFMIN(n_end, 8);
-++
-++    // Really this is i != n but the simple unconditional loop is cheaper
-++    // and faster
-++    for (i = 0; i != 8; ++i)
-++        levels[i] = 1;
-++
-++    rv = get_cabac_greater1_bits(c, n, state0);
-++
-++    *pprev_subset_coded = 0;
-++    *psum = n;
-++
-++    rv <<= (32 - n);
-++    if (rv != 0)
-++    {
-++        *pprev_subset_coded = 1;
-++        *psum = n + 1;
-++        i = hevc_clz32(rv);
-++        levels[i] = 2;
-++        if (get_cabac(c, state_gt2) == 0)
-++        {
-++            // Unset first coded bit
-++            rv &= ~(0x80000000U >> i);
-++        }
-++    }
-++
-++    if (n_end > 8) {
-++        const unsigned int g8 = n_end - 8;
-++        rv |= ((1 << g8) - 1) << (24 - g8);
-++        for (i = 0; i != g8; ++i) {
-++            levels[i + 8] = 0;
-++        }
-++    }
-++
-++    return rv;
-++}
-++
-++// extended_precision_processing_flag must be false given we are
-++// putting the result into a 16-bit array
-++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-++// scale_m is uint8_t
-++//
-++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
-++//   or it can be 2 (if we have transquant_bypass)
-++// shift is set to one less than we really want but would normally be
-++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-++// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6
-++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-++// to achieve it
-++
-++#ifndef trans_scale_sat
-++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-++{
-++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-++}
-++#endif
-++
-++
-++#ifndef update_rice
-++static inline void update_rice(uint8_t * const stat_coeff,
-++    const unsigned int last_coeff_abs_level_remaining,
-++    const unsigned int c_rice_param)
-++{
-++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
-++    if (x >= 6)
-++        (*stat_coeff)++;
-++    else if (x == 0 && *stat_coeff > 0)
-++        (*stat_coeff)--;
-++}
-++#endif
-++
-++
-++// n must be > 0 on entry
-++#ifndef get_cabac_sig_coeff_flag_idxs
-++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * p)
-++{
-++    do {
-++        if (get_cabac(c, state0 + ctx_map[n]))
-++            *p++ = n;
-++    } while (--n != 0);
-++    return p;
-++}
-++#endif
-++
-++
-++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * const flag_idx)
-++{
-++    int rv;
-++
-++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
-++
-++    return rv;
-++}
-++
-++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x1,  x2,  x3,\
-++     x4,  x5,  x6,  x7,\
-++     x8,  x9, x10, x11,\
-++    x12, x13, x14, x15}
-++
-++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x4,  x8, x12,\
-++     x1,  x5,  x9, x13,\
-++     x2,  x6, x10, x14,\
-++     x3,  x7, x11, x15}
-++
-++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x4,  x1,  x8,\
-++     x5,  x2, x12,  x9,\
-++     x6,  x3, x13, x10,\
-++     x7, x14, x11, x15}
-++
-++
-++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
-++    uint8_t * const significant_coeff_group_flag,
-++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
-++    int * const pPrev_sig)
-++{
-++    while (--i >= 0) {
-++        unsigned int x_cg = scan_x_cg[i];
-++        unsigned int y_cg = scan_y_cg[i];
-++
-++        // For the flag decode we only care about Z/NZ but
-++        // we use the full Right + Down * 2 when calculating
-++        // significant coeff flags so we obtain it here
-++        //.
-++        // The group flag array is one longer than it needs to
-++        // be so we don't need to check for y_cg limits
-++        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
-++            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
-++
-++        if (i == 0 ||
-++            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
-++        {
-++            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
-++            *pPrev_sig = prev_sig;
-++            break;
-++        }
-++    }
-++
-++    return i;
-++}
-++
-+ 
-+ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                                 int log2_trafo_size, enum ScanType scan_idx,
-+                                 int c_idx)
-+ {
-+-#define GET_COORD(offset, n)                                    \
-+-    do {                                                        \
-+-        x_c = (x_cg << 2) + scan_x_off[n];                      \
-+-        y_c = (y_cg << 2) + scan_y_off[n];                      \
-+-    } while (0)
-+-    HEVCLocalContext *lc = s->HEVClc;
-+-    int transform_skip_flag = 0;
-++    HEVCLocalContext * const lc = s->HEVClc;
-++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
-+ 
-+     int last_significant_coeff_x, last_significant_coeff_y;
-+-    int last_scan_pos;
-+-    int n_end;
-+     int num_coeff = 0;
-+-    int greater1_ctx = 1;
-++    int prev_subset_coded = 0;
-+ 
-+     int num_last_subset;
-+     int x_cg_last_sig, y_cg_last_sig;
-+ 
-+-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-++    const uint8_t *scan_x_cg, *scan_y_cg;
-++    const xy_off_t * scan_xy_off;
-+ 
-+     ptrdiff_t stride = s->frame->linesize[c_idx];
-+     int hshift = s->ps.sps->hshift[c_idx];
-+@@ -1032,21 +1529,28 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+ #ifdef RPI
-+-    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-++    //***** transform_skip_flag decoded later!
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
-+ #endif
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
-++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
-+     int explicit_rdpcm_flag = 0;
-+     int explicit_rdpcm_dir_flag;
-+ 
-+     int trafo_size = 1 << log2_trafo_size;
-+     int i;
-+-    int qp,shift,add,scale,scale_m;
-+-    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-++    int qp,shift,scale;
-++    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-+     const uint8_t *scale_matrix = NULL;
-+     uint8_t dc_scale;
-+     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-+                                          lc->tu.intra_pred_mode_c;
-++
-++    int prev_sig = 0;
-++    const int c_idx_nz = (c_idx != 0);
-++
-++    int may_hide_sign;
-++
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+@@ -1078,7 +1582,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ 
-+     // Derive QP for dequant
-+     if (!lc->cu.cu_transquant_bypass_flag) {
-+-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-++        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-+         static const uint8_t rem6[51 + 4 * 6 + 1] = {
-+             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-+             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-+@@ -1094,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         };
-+         int qp_y = lc->qp_y;
-+ 
-++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
-++
-+         if (s->ps.pps->transform_skip_enabled_flag &&
-+             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-+-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
-++            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
-++            if (transform_skip_flag) {
-++                trans_skip_or_bypass = 1;
-++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
-++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
-++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
-++                    may_hide_sign = 0;
-++                }
-++            }
-+         }
-+ 
-+         if (c_idx == 0) {
-+@@ -1129,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             qp += s->ps.sps->qp_bd_offset;
-+         }
-+ 
-+-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-+-        add      = 1 << (shift-1);
-+-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-+-        scale_m  = 16; // default when no custom scaling lists.
-+-        dc_scale = 16;
-++        // Shift is set to one less than will actually occur as the scale
-++        // and saturate step adds 1 and then shifts right again
-++        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
-++        scale = level_scale[rem6[qp]];
-++        if (div6[qp] >= shift) {
-++            scale <<= (div6[qp] - shift);
-++            shift = 0;
-++        } else {
-++            shift -= div6[qp];
-++        }
-+ 
-+-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
-+             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-+-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-+             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
-+ 
-+             matrix_id = 3 * matrix_id + c_idx;
-+ 
-+             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-++            dc_scale = scale_matrix[0];
-+             if (log2_trafo_size >= 4)
-+                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-+         }
-++        else
-++        {
-++            static const uint8_t sixteen_scale[64] = {
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16
-++            };
-++            scale_matrix = sixteen_scale;
-++            dc_scale = 16;
-++        }
-+     } else {
-++        static const uint8_t unit_scale[64] = {
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++        };
-++        scale_matrix = unit_scale;
-+         shift        = 0;
-+-        add          = 0;
-+-        scale        = 0;
-+-        dc_scale     = 0;
-++        scale        = 2;  // We will shift right to kill this
-++        dc_scale     = 1;
-++
-++        may_hide_sign = 0;
-+     }
-+ 
-+     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-+-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
-++        trans_skip_or_bypass) {
-++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
-+         if (explicit_rdpcm_flag) {
-+-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
-++            may_hide_sign = 0;
-++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
-+         }
-+     }
-+ 
-+-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
-++    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
-+                                            &last_significant_coeff_x, &last_significant_coeff_y);
-+ 
-+     if (last_significant_coeff_x > 3) {
-+@@ -1189,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         int last_x_c = last_significant_coeff_x & 3;
-+         int last_y_c = last_significant_coeff_y & 3;
-+ 
-+-        scan_x_off = ff_hevc_diag_scan4x4_x;
-+-        scan_y_off = ff_hevc_diag_scan4x4_y;
-+         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-+-        if (trafo_size == 4) {
-++
-++        switch (log2_trafo_size) {
-++        case 2:
-+             scan_x_cg = scan_1x1;
-+             scan_y_cg = scan_1x1;
-+-        } else if (trafo_size == 8) {
-++            break;
-++        case 3:
-+             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = diag_scan2x2_x;
-+             scan_y_cg = diag_scan2x2_y;
-+-        } else if (trafo_size == 16) {
-++            break;
-++        case 4:
-+             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = ff_hevc_diag_scan4x4_x;
-+             scan_y_cg = ff_hevc_diag_scan4x4_y;
-+-        } else { // trafo_size == 32
-++            break;
-++        case 5:
-++        default:
-+             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = ff_hevc_diag_scan8x8_x;
-+             scan_y_cg = ff_hevc_diag_scan8x8_y;
-++            break;
-+         }
-+         break;
-+     }
-+     case SCAN_HORIZ:
-+         scan_x_cg = horiz_scan2x2_x;
-+         scan_y_cg = horiz_scan2x2_y;
-+-        scan_x_off = horiz_scan4x4_x;
-+-        scan_y_off = horiz_scan4x4_y;
-+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-+         break;
-+     default: //SCAN_VERT
-+         scan_x_cg = horiz_scan2x2_y;
-+         scan_y_cg = horiz_scan2x2_x;
-+-        scan_x_off = horiz_scan4x4_y;
-+-        scan_y_off = horiz_scan4x4_x;
-+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-+         break;
-+     }
-+     num_coeff++;
-+     num_last_subset = (num_coeff - 1) >> 4;
-+ 
-+-    for (i = num_last_subset; i >= 0; i--) {
-+-        int n, m;
-+-        int x_cg, y_cg, x_c, y_c, pos;
-+-        int implicit_non_zero_coeff = 0;
-+-        int64_t trans_coeff_level;
-+-        int prev_sig = 0;
-+-        int offset = i << 4;
-+-        int rice_init = 0;
-+-
-+-        uint8_t significant_coeff_flag_idx[16];
-+-        uint8_t nb_significant_coeff_flag = 0;
-++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
-+ 
-+-        x_cg = scan_x_cg[i];
-+-        y_cg = scan_y_cg[i];
-++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
-+ 
-+-        if ((i < num_last_subset) && (i > 0)) {
-+-            int ctx_cg = 0;
-+-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-+-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-+-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-+-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-+-
-+-            significant_coeff_group_flag[x_cg][y_cg] =
-+-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-+-            implicit_non_zero_coeff = 1;
-+-        } else {
-+-            significant_coeff_group_flag[x_cg][y_cg] =
-+-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-+-             (x_cg == 0 && y_cg == 0));
-+-        }
-++    i = num_last_subset;
-++    do {
-++        int implicit_non_zero_coeff = 0;
-++        int n_end;
-+ 
-+-        last_scan_pos = num_coeff - offset - 1;
-++        uint8_t significant_coeff_flag_idx[16];
-++        unsigned int nb_significant_coeff_flag = 0;
-+ 
-+         if (i == num_last_subset) {
-++            // First time through
-++            int last_scan_pos = num_coeff - (i << 4) - 1;
-+             n_end = last_scan_pos - 1;
-+             significant_coeff_flag_idx[0] = last_scan_pos;
-+             nb_significant_coeff_flag = 1;
-+         } else {
-+             n_end = 15;
-++            implicit_non_zero_coeff = (i != 0);
-+         }
-+ 
-+-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-+-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
-+-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-+-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-+-
-+-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
-+-            static const uint8_t ctx_idx_map[] = {
-+-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
-+-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
-+-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
-+-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
-+-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
-++        if (n_end >= 0) {
-++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
-++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
-++            };
-++            static const uint8_t ctx_idx_maps[3][4][16] = {
-++                {
-++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                },
-++                {
-++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                },
-++                {
-++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                }
-+             };
-+             const uint8_t *ctx_idx_map_p;
-+             int scf_offset = 0;
-+-            if (s->ps.sps->transform_skip_context_enabled_flag &&
-+-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
-+-                if (c_idx == 0) {
-+-                    scf_offset = 40;
-+-                } else {
-+-                    scf_offset = 14 + 27;
-+-                }
-++
-++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-++                ctx_idx_map_p = ctx_idx_maps[0][3];
-++                scf_offset = 40 + c_idx_nz;
-+             } else {
-+-                if (c_idx != 0)
-++                if (c_idx_nz != 0)
-+                     scf_offset = 27;
-++
-+                 if (log2_trafo_size == 2) {
-+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
-++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
-+                 } else {
-+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
-+-                    if (c_idx == 0) {
-+-                        if ((x_cg > 0 || y_cg > 0))
-++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
-++                    if (!c_idx_nz) {
-++                        if (i != 0)
-+                             scf_offset += 3;
-++
-+                         if (log2_trafo_size == 3) {
-+                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
-+                         } else {
-+@@ -1315,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                     }
-+                 }
-+             }
-+-            for (n = n_end; n > 0; n--) {
-+-                x_c = scan_x_off[n];
-+-                y_c = scan_y_off[n];
-+-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
-+-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-+-                    nb_significant_coeff_flag++;
-++
-++            if (n_end > 0) {
-++                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
-++                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
-++                    n_end, ctx_idx_map_p,
-++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
-++
-++                nb_significant_coeff_flag += cnt;
-++                if (cnt != 0) {
-+                     implicit_non_zero_coeff = 0;
-+                 }
-+             }
-++
-+             if (implicit_non_zero_coeff == 0) {
-+-                if (s->ps.sps->transform_skip_context_enabled_flag &&
-+-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-                    if (c_idx == 0) {
-+-                        scf_offset = 42;
-+-                    } else {
-+-                        scf_offset = 16 + 27;
-+-                    }
-++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-++                    scf_offset = 42 + c_idx_nz;
-+                 } else {
-+                     if (i == 0) {
-+-                        if (c_idx == 0)
-+-                            scf_offset = 0;
-+-                        else
-+-                            scf_offset = 27;
-++                        scf_offset = c_idx_nz ? 27 : 0;
-+                     } else {
-+                         scf_offset = 2 + scf_offset;
-+                     }
-+                 }
-+-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
-++                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
-+                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+                     nb_significant_coeff_flag++;
-+                 }
-+@@ -1352,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             }
-+         }
-+ 
-+-        n_end = nb_significant_coeff_flag;
-+-
-++        if (nb_significant_coeff_flag != 0) {
-++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
-++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
-++                prev_subset_coded;
-++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
-++                (gt1_idx_delta << 2);
-++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
-++                gt1_idx_delta;
-++
-++            const unsigned int x_cg = scan_x_cg[i];
-++            const unsigned int y_cg = scan_y_cg[i];
-++            int16_t * const blk_coeffs = coeffs +
-++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
-++            // This calculation is 'wrong' for log2_trafo_size == 2
-++            // but that doesn't matter as in this case x_cg & y_cg
-++            // are always 0 so result is correct (0) anyway
-++            const uint8_t * const blk_scale = scale_matrix +
-++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
-++
-++            // * The following code block doesn't deal with these flags:
-++            //   (nor did the one it replaces)
-++            //
-++            // cabac_bypass_alignment_enabled_flag
-++            //    This should be easy but I can't find a test case
-++            // extended_precision_processing_flag
-++            //    This can extend the required precision past 16bits
-++            //    so is probably tricky - also no example found yet
-++
-++#if USE_N_END_1
-++            if (nb_significant_coeff_flag == 1) {
-++                // There is a small gain to be had from special casing the single
-++                // transform coefficient case.  The reduction in complexity
-++                // makes up for the code duplication.
-++
-++                int trans_coeff_level = 1;
-++                int coeff_sign_flag;
-++                int coded_val = 0;
-++
-++                // initialize first elem of coeff_abs_level_greater1_flag
-++                prev_subset_coded = 0;
-++
-++                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
-++                    trans_coeff_level = 2;
-++                    prev_subset_coded = 1;
-++                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
-++                }
-+ 
-+-        if (n_end) {
-+-            int first_nz_pos_in_cg;
-+-            int last_nz_pos_in_cg;
-+-            int c_rice_param = 0;
-+-            int first_greater1_coeff_idx = -1;
-+-            uint8_t coeff_abs_level_greater1_flag[8];
-+-            uint16_t coeff_sign_flag;
-+-            int sum_abs = 0;
-+-            int sign_hidden;
-+-            int sb_type;
-++                // Probably not worth the overhead of starting by22 for just one value
-++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
-+ 
-++                if (coded_val)
-++                {
-++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
-++                    } else {
-++                        uint8_t * const stat_coeff =
-++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-++                        const unsigned int c_rice_param = *stat_coeff >> 2;
-++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+ 
-+-            // initialize first elem of coeff_bas_level_greater1_flag
-+-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
-++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-++                    }
-++                }
-+ 
-+-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-+-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
-+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
-+-                else
-+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
-+-                c_rice_param = lc->stat_coeff[sb_type] / 4;
-+-            }
-++                {
-++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
-++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
-++                    const unsigned int scale_m = blk_scale[xy_off->scale];
-+ 
-+-            if (!(i == num_last_subset) && greater1_ctx == 0)
-+-                ctx_set++;
-+-            greater1_ctx = 1;
-+-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-+-
-+-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-+-                int inc = (ctx_set << 2) + greater1_ctx;
-+-                coeff_abs_level_greater1_flag[m] =
-+-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-+-                if (coeff_abs_level_greater1_flag[m]) {
-+-                    greater1_ctx = 0;
-+-                    if (first_greater1_coeff_idx == -1)
-+-                        first_greater1_coeff_idx = m;
-+-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-+-                    greater1_ctx++;
-++                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
-++                        (trans_coeff_level ^ k) - k,  // Apply sign
-++                        scale,
-++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
-++                        shift);
-+                 }
-+             }
-+-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-+-
-+-            if (lc->cu.cu_transquant_bypass_flag ||
-+-                (lc->cu.pred_mode ==  MODE_INTRA  &&
-+-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
-+-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
-+-                 explicit_rdpcm_flag)
-+-                sign_hidden = 0;
-+             else
-+-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
-++#endif
-++            {
-++                int sign_hidden = may_hide_sign;
-++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
-++                uint32_t coeff_sign_flags;
-++                uint32_t coded_vals = 0;
-++                // Sum(abs(level[]))
-++                // In fact we only need the bottom bit and in some future
-++                // version that may be all we calculate
-++                unsigned int sum_abs;
-++
-++                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
-++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
-++
-++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
-++                    sign_hidden = 0;
-++
-++                // -- Start bypass block
-++
-++                bypass_start(s);
-++
-++                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
-++
-++                if (coded_vals != 0)
-++                {
-++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
-++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
-++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
-++                    int * level = levels - 1;
-++
-++                    do {
-++                        {
-++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
-++                            level += z;
-++                            coded_vals <<= z;
-++                        }
-+ 
-+-            if (first_greater1_coeff_idx != -1) {
-+-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-+-            }
-+-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
-+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-+-            } else {
-+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-+-            }
-++                        {
-++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
-++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
-++
-++                            sum_abs += last_coeff_abs_level_remaining + 1;
-++                            *level = trans_coeff_level;
-+ 
-+-            for (m = 0; m < n_end; m++) {
-+-                n = significant_coeff_flag_idx[m];
-+-                GET_COORD(offset, n);
-+-                if (m < 8) {
-+-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
-+-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
-+-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+-
-+-                        trans_coeff_level += last_coeff_abs_level_remaining;
-+-                        if (trans_coeff_level > (3 << c_rice_param))
-+-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-+-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-+-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-+-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-+-                                lc->stat_coeff[sb_type]++;
-+-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-+-                                if (lc->stat_coeff[sb_type] > 0)
-+-                                    lc->stat_coeff[sb_type]--;
-+-                            rice_init = 1;
-++                            if (stat_coeff != NULL)
-++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-++                            stat_coeff = NULL;
-++
-++                            if (trans_coeff_level > (3 << c_rice_param) &&
-++                                (c_rice_param < 4 || rice_adaptation_enabled))
-++                                ++c_rice_param;
-+                         }
-+-                    }
-+-                } else {
-+-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+-
-+-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
-+-                    if (trans_coeff_level > (3 << c_rice_param))
-+-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-+-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-+-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-+-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-+-                            lc->stat_coeff[sb_type]++;
-+-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-+-                            if (lc->stat_coeff[sb_type] > 0)
-+-                                lc->stat_coeff[sb_type]--;
-+-                        rice_init = 1;
-+-                    }
-++                    } while (coded_vals != 0);
-+                 }
-+-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-+-                    sum_abs += trans_coeff_level;
-+-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
-+-                        trans_coeff_level = -trans_coeff_level;
-++
-++                // sign_hidden = 0 or 1 so we can combine the tests
-++                if ((sign_hidden & sum_abs) != 0) {
-++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
-+                 }
-+-                if (coeff_sign_flag >> 15)
-+-                    trans_coeff_level = -trans_coeff_level;
-+-                coeff_sign_flag <<= 1;
-+-                if(!lc->cu.cu_transquant_bypass_flag) {
-+-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-+-                        if(y_c || x_c || log2_trafo_size < 4) {
-+-                            switch(log2_trafo_size) {
-+-                                case 3: pos = (y_c << 3) + x_c; break;
-+-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-+-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-+-                                default: pos = (y_c << 2) + x_c; break;
-+-                            }
-+-                            scale_m = scale_matrix[pos];
-+-                        } else {
-+-                            scale_m = dc_scale;
-+-                        }
-++
-++                bypass_finish(s);
-++
-++                // -- Finish bypass block
-++
-++                // Scale loop
-++                {
-++                    int m = nb_significant_coeff_flag - 1;
-++
-++                    // Deal with DC component (if any) first
-++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
-++                    {
-++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-++                        blk_coeffs[0] = trans_scale_sat(
-++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
-++                        --m;
-+                     }
-+-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-+-                    if(trans_coeff_level < 0) {
-+-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-+-                            trans_coeff_level = -32768;
-+-                    } else {
-+-                        if(trans_coeff_level & 0xffffffffffff8000)
-+-                            trans_coeff_level = 32767;
-++
-++#if !USE_N_END_1
-++                    // If USE_N_END_1 is set then m was at least 1 initially
-++                    if (m >= 0)
-++#endif
-++                    {
-++                        do {
-++                            const xy_off_t * const xy_off = scan_xy_off +
-++                                significant_coeff_flag_idx[m];
-++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-++
-++                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
-++                                (levels[m] ^ k) - k,
-++                                scale,
-++                                blk_scale[xy_off->scale],
-++                                shift);
-++                        } while (--m >= 0);
-+                     }
-+                 }
-+-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-++
-+             }
-+         }
-+-    }
-++    } while ((i = next_subset(s, i, c_idx_nz,
-++        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
-+ 
-+     if (lc->cu.cu_transquant_bypass_flag) {
-+         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+@@ -1496,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+         }
-+     } else {
-+-        if (transform_skip_flag) {
-++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
-+             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
-+                       log2_trafo_size == 2 &&
-+                       lc->cu.pred_mode == MODE_INTRA;
-+-- 
-+2.5.0
-+
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index e4acfa9..072e711 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -4,7 +4,8 @@ DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_c
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-   0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
-   hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch \
--  pfcd_hevc_optimisations.patch
-+  pfcd_hevc_optimisations.patch \
-+  0001-Squashed-commit-of-the-following.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -66,6 +67,7 @@ ifeq ($(Configuration), Release)
-   ffmpg_config += --disable-debug
- endif
- 
-+ffmpg_config += --extra-cflags="-DRPI=1"
- 
- CLEAN_FILES=$(ARCHIVE) $(PLATFORM)
- 
-@@ -84,6 +86,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- 	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
- 	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
- 
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
-
-From 653e6185b0976bd50eea79f9834ede99db13e3aa Mon Sep 17 00:00:00 2001
-From: Rainer Hochecker <fernetmenta@online.de>
-Date: Tue, 14 Jul 2015 08:30:44 +0200
-Subject: [PATCH 65/93] fix high cpu load caused by false positive frame
- limiter
-
----
- xbmc/Application.cpp | 20 +++++++++-----------
- 1 file changed, 9 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index 212a5c7..1adbb01 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -1902,7 +1902,7 @@ void CApplication::Render()
- 
-   bool hasRendered = false;
-   bool limitFrames = false;
--  unsigned int singleFrameTime = 10; // default limit 100 fps
-+  unsigned int singleFrameTime = 40; // default limit 25 fps
-   bool vsync = true;
- 
-   // Whether externalplayer is playing and we're unfocused
-@@ -1916,24 +1916,22 @@ void CApplication::Render()
-     if (!extPlayerActive && g_graphicsContext.IsFullScreenVideo() && !m_pPlayer->IsPausedPlayback())
-     {
-       m_bPresentFrame = g_renderManager.HasFrame();
--      if (vsync_mode == VSYNC_DISABLED)
--        vsync = false;
-     }
-     else
-     {
-       // engage the frame limiter as needed
-       limitFrames = lowfps || extPlayerActive;
--      // DXMERGE - we checked for g_videoConfig.GetVSyncMode() before this
--      //           perhaps allowing it to be set differently than the UI option??
-+
-+      // TODO:
-+      // remove those useless modes, they don't do any good
-       if (vsync_mode == VSYNC_DISABLED || vsync_mode == VSYNC_VIDEO)
-       {
-         limitFrames = true; // not using vsync.
--        vsync = false;
-+        singleFrameTime = 10;
-       }
--      else if ((g_infoManager.GetFPS() > g_graphicsContext.GetFPS() + 10) && g_infoManager.GetFPS() > 1000.0f / singleFrameTime)
-+      else if ((g_infoManager.GetFPS() > g_graphicsContext.GetFPS() + 10) && g_infoManager.GetFPS() > 100.0f)
-       {
-         limitFrames = true; // using vsync, but it isn't working.
--        vsync = false;
-       }
- 
-       if (limitFrames)
-@@ -1957,7 +1955,10 @@ void CApplication::Render()
-   else if (vsync_mode == VSYNC_ALWAYS)
-     g_Windowing.SetVSync(true);
-   else if (vsync_mode != VSYNC_DRIVER)
-+  {
-     g_Windowing.SetVSync(false);
-+    vsync = false;
-+  }
- 
-   if (m_bPresentFrame && m_pPlayer->IsPlaying() && !m_pPlayer->IsPaused())
-     ResetScreenSaver();
-@@ -2025,9 +2026,6 @@ void CApplication::Render()
-   //fps limiter, make sure each frame lasts at least singleFrameTime milliseconds
-   if (limitFrames || !(flip || m_bPresentFrame))
-   {
--    if (!limitFrames)
--      singleFrameTime = 40; //if not flipping, loop at 25 fps
--
-     unsigned int frameTime = now - m_lastFrameTime;
-     if (frameTime < singleFrameTime)
-       Sleep(singleFrameTime - frameTime);
-
-From 7eae470ce134f19cb5002969ac3f7e85fcf5220d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 5 Aug 2015 13:43:25 +0100
-Subject: [PATCH 66/93] [dvdplayeraudio] Avoid busy spinning when queue is
- empty
-
----
- xbmc/cores/dvdplayer/DVDPlayerAudio.cpp | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-index 97a23a6..9f21a19 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-@@ -541,6 +541,8 @@ void CDVDPlayerAudio::Process()
-         m_dvdAudio.Drain();
-         m_dvdAudio.Flush();
-         m_stalled = true;
-+        // avoid busy spinning here
-+        Sleep(10);
-       }
- 
-       continue;
-
-From 492a2e7ac5fb1895b71b62f68918e74db053f0b9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 6 Aug 2015 11:23:05 +0100
-Subject: [PATCH 67/93] [rbp] Make sync playback to display the default option
-
----
- system/settings/rbp.xml | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 1506035..f2a6892 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -1,6 +1,13 @@
- <?xml version="1.0" encoding="utf-8" ?>
- <settings>
-   <section id="videos">
-+    <category id="videoplayer">
-+      <group id="3">
-+        <setting id="videoplayer.usedisplayasclock">
-+          <default>true</default>
-+        </setting>
-+      </group>
-+    </category>
-     <category id="videoacceleration">
-       <group id="1">
-         <visible>false</visible>
-
-From 3ff59db3bd9c43b037bbe89c72f5fd97f4563b71 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 8 Sep 2015 23:42:30 +0100
-Subject: [PATCH 68/93] [cec] Fixing initialisation issue found on Raspberry Pi
- with Buildroot
-
----
- tools/depends/target/libcec/Makefile               |  1 +
- ...ssue-found-on-Raspberry-Pi-with-Buildroot.patch | 24 ++++++++++++++++++++++
- 2 files changed, 25 insertions(+)
- create mode 100644 tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index 5d1f933..4663faa 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -23,6 +23,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-+	cd $(PLATFORM); patch -p1 < ../fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-diff --git a/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch b/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-new file mode 100644
-index 0000000..8f289f2
---- /dev/null
-+++ b/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-@@ -0,0 +1,24 @@
-+From 9a252570dc3ca1f5b92a48542e29b2722550e670 Mon Sep 17 00:00:00 2001
-+From: Erwan LOUET <erwan.louet@orange.com>
-+Date: Fri, 4 Sep 2015 15:34:19 +0200
-+Subject: [PATCH] fixing initialisation issue found on Raspberry Pi with
-+ Buildroot
-+
-+---
-+ src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp | 3 ++-
-+ 1 file changed, 2 insertions(+), 1 deletion(-)
-+
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+index 6f0804d..95b4fef 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+@@ -71,7 +71,8 @@ CRPiCECAdapterCommunication::CRPiCECAdapterCommunication(IAdapterCommunicationCa
-+     m_bLogicalAddressChanged(false),
-+     m_previousLogicalAddress(CECDEVICE_FREEUSE),
-+     m_bLogicalAddressRegistered(false),
-+-    m_bDisableCallbacks(false)
-++    m_bDisableCallbacks(false),
-++    m_bInitialised(false)
-+ {
-+   m_queue = new CRPiCECAdapterMessageQueue(this);
-+ }
-
-From c456ad03e68428ef849490c385cc069cb8dde87d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 69/93] [mmalcodec] Fail to open when width is invalid. Can
- happen with mpegts files
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 3345685..5386b4a 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -532,6 +532,10 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s usemmal:%d software:%d %dx%d pool:%p", CLASSNAME, __func__, CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL), hints.software, hints.width, hints.height, options.m_opaque_pointer);
- 
-+  // This occurs at start of m2ts files before streams have been fully identified - just ignore
-+  if (!hints.width)
-+    return false;
-+
-   // we always qualify even if DVDFactoryCodec does this too.
-   if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
-     return false;
-
-From 9bcbb1f3c5e687ccf4aeecbe583eb7643f5d48c8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 21 Dec 2015 18:34:06 +0000
-Subject: [PATCH 70/93] [mmalrender] Force a SetVideoRect after a reconfigure
-
-There has been an issue with dvd stills and a hdmi refresh rate change.
-The hdmi mode change loses the currently displayed picture.
-Not an issue for normal video playback as another picture will be along soon.
-Not the case in DVD menus.
-
-SetVideoRect makes the last picture redisplay and so fixes up the menu.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index adf6f73..ad3f66f 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -286,6 +286,10 @@ bool CMMALRenderer::Configure(unsigned int width, unsigned int height, unsigned
-   m_fps = fps;
-   m_iFlags = flags;
- 
-+  // cause SetVideoRect to trigger - needed after a hdmi mode change
-+  m_src_rect.SetRect(0, 0, 0, 0);
-+  m_dst_rect.SetRect(0, 0, 0, 0);
-+
-   CLog::Log(LOGDEBUG, "%s::%s - %dx%d->%dx%d@%.2f flags:%x format:%d ext:%x orient:%d", CLASSNAME, __func__, width, height, d_width, d_height, fps, flags, format, extended_format, orientation);
-   if (format != RENDER_FMT_YUV420P && format != RENDER_FMT_BYPASS && format != RENDER_FMT_MMAL)
-   {
-
-From 902a0514368d1ec48107d5951ee990b93cb4282f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 31 Mar 2015 17:31:47 +0100
-Subject: [PATCH 71/93] mmalcodec: Add SetCodecControl function
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 10 +++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 11 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 5386b4a..799b708 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -130,6 +130,7 @@ CMMALVideo::CMMALVideo()
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-   m_num_decoded = 0;
-+  m_codecControlFlags = 0;
- }
- 
- CMMALVideo::~CMMALVideo()
-@@ -875,7 +876,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   }
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d) flags(%x)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full, m_codecControlFlags);
- 
-   return ret;
- }
-@@ -939,6 +940,7 @@ void CMMALVideo::Reset(void)
-   }
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
-+  m_codecControlFlags = 0;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -1049,3 +1051,9 @@ bool CMMALVideo::GetCodecStats(double &pts, int &droppedPics)
-   droppedPics= -1;
-   return false;
- }
-+
-+void CMMALVideo::SetCodecControl(int flags)
-+{
-+  CSingleLock lock(m_sharedSection);
-+  m_codecControlFlags = flags;
-+}
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index f4df09c..37d0868 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -79,6 +79,7 @@ public:
-   virtual void SetDropState(bool bDrop);
-   virtual const char* GetName(void) { return m_pFormatName ? m_pFormatName:"mmal-xxx"; }
-   virtual bool GetCodecStats(double &pts, int &droppedPics);
-+  virtual void SetCodecControl(int flags);
-   virtual void SetSpeed(int iSpeed);
- 
-   // MMAL decoder callback routines.
-@@ -121,6 +122,7 @@ protected:
-   double            m_demuxerPts;
-   double            m_decoderPts;
-   int               m_speed;
-+  int               m_codecControlFlags;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From cae99d2093015ba70d1a387e83ed6214393fc31a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 27 Dec 2015 18:44:22 +0000
-Subject: [PATCH 72/93] mmalcodec: Switch to a condition variable when blocking
- waiting for a picture
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 38 ++++++++++++----------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  3 +-
- 2 files changed, 23 insertions(+), 18 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 799b708..3579966 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -102,7 +102,6 @@ CMMALVideo::CMMALVideo()
- {
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s %p", CLASSNAME, __func__, this);
--  pthread_mutex_init(&m_output_mutex, NULL);
- 
-   m_decoded_width = 0;
-   m_decoded_height = 0;
-@@ -141,7 +140,6 @@ CMMALVideo::~CMMALVideo()
-     Dispose();
- 
-   CSingleLock lock(m_sharedSection);
--  pthread_mutex_destroy(&m_output_mutex);
- 
-   if (m_deint && m_deint->control && m_deint->control->is_enabled)
-     mmal_port_disable(m_deint->control);
-@@ -285,9 +283,11 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-         omvb->width = m_decoded_width;
-         omvb->height = m_decoded_height;
-         omvb->m_aspect_ratio = m_aspect_ratio;
--        pthread_mutex_lock(&m_output_mutex);
--        m_output_ready.push(omvb);
--        pthread_mutex_unlock(&m_output_mutex);
-+        {
-+          CSingleLock lock(m_output_mutex);
-+          m_output_ready.push(omvb);
-+          m_output_cond.notifyAll();
-+        }
-         kept = true;
-       }
-     }
-@@ -867,7 +867,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     {
-       // otherwise we busy spin
-       CSingleExit unlock(m_sharedSection);
--      Sleep(10);
-+      CSingleLock lock(m_output_mutex);
-+      m_output_cond.wait(lock, 10);
-     }
-     if (!m_output_ready.empty())
-       ret |= VC_PICTURE;
-@@ -916,14 +917,16 @@ void CMMALVideo::Reset(void)
-   while (1)
-   {
-     CMMALVideoBuffer *buffer = NULL;
--    pthread_mutex_lock(&m_output_mutex);
--    // fetch a output buffer and pop it off the ready list
--    if (!m_output_ready.empty())
-     {
--      buffer = m_output_ready.front();
--      m_output_ready.pop();
-+      CSingleLock lock(m_output_mutex);
-+      // fetch a output buffer and pop it off the ready list
-+      if (!m_output_ready.empty())
-+      {
-+        buffer = m_output_ready.front();
-+        m_output_ready.pop();
-+      }
-+      m_output_cond.notifyAll();
-     }
--    pthread_mutex_unlock(&m_output_mutex);
-     if (buffer)
-     {
-       buffer->Acquire();
-@@ -984,11 +987,12 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
-   {
-     CMMALVideoBuffer *buffer;
-     // fetch a output buffer and pop it off the ready list
--    pthread_mutex_lock(&m_output_mutex);
--    buffer = m_output_ready.front();
--    m_output_ready.pop();
--    pthread_mutex_unlock(&m_output_mutex);
--
-+    {
-+      CSingleLock lock(m_output_mutex);
-+      buffer = m_output_ready.front();
-+      m_output_ready.pop();
-+      m_output_cond.notifyAll();
-+    }
-     assert(buffer->mmal_buffer);
-     memset(pDvdVideoPicture, 0, sizeof *pDvdVideoPicture);
-     pDvdVideoPicture->format = RENDER_FMT_MMAL;
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 37d0868..ca28c6f 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -105,7 +105,8 @@ protected:
-   const char        *m_pFormatName;
- 
-   // mmal output buffers (video frames)
--  pthread_mutex_t   m_output_mutex;
-+  CCriticalSection m_output_mutex;
-+  XbmcThreads::ConditionVariable m_output_cond;
-   std::queue<CMMALVideoBuffer*> m_output_ready;
- 
-   // initialize mmal and get decoder component
-
-From ec6e9acc113651fc3408c9fc32d188f41d8de64a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 16 Jan 2016 16:46:03 +0000
-Subject: [PATCH 73/93] omxaudio: Avoid reporting a spurious cached value
-
-Avoids seek bar showing zero after a seek
----
- xbmc/cores/omxplayer/OMXAudio.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
-index 70d0866..052b5ef 100644
---- a/xbmc/cores/omxplayer/OMXAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudio.cpp
-@@ -1335,7 +1335,7 @@ float COMXAudio::GetDelay()
-   if (m_last_pts != DVD_NOPTS_VALUE && m_av_clock)
-     stamp = m_av_clock->OMXMediaTime();
-   // if possible the delay is current media time - time of last submitted packet
--  if (stamp != DVD_NOPTS_VALUE)
-+  if (stamp != DVD_NOPTS_VALUE && stamp != 0.0)
-   {
-     ret = (m_last_pts - stamp) * (1.0 / DVD_TIME_BASE);
-   }
-
-From 711b4b11b49c9ebc255e565462e3ac665a1cda8c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 74/93] [omximage] Fall back to arm jpeg encode/decode when gpu
- is busy
-
----
- xbmc/cores/omxplayer/OMXImage.cpp | 50 ++++++++++++++++++++++++++++++++-------
- xbmc/cores/omxplayer/OMXImage.h   |  7 ++++++
- 2 files changed, 48 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXImage.cpp b/xbmc/cores/omxplayer/OMXImage.cpp
-index a01c435..e592989 100644
---- a/xbmc/cores/omxplayer/OMXImage.cpp
-+++ b/xbmc/cores/omxplayer/OMXImage.cpp
-@@ -56,12 +56,17 @@ static XbmcThreads::ConditionVariable g_count_cond;
- static CCriticalSection               g_count_lock;
- static int g_count_val;
- 
--static void limit_calls_enter()
-+static bool limit_calls_enter()
- {
-   CSingleLock lock(g_count_lock);
-+  // on Pi2 fall back to arm decode if the queue is getting big
-+  if (g_RBP.RasberryPiVersion() > 1 && g_count_val >= 2)
-+    return false;
-+
-   while (g_count_val >= 3)
-     g_count_cond.wait(lock);
-   g_count_val++;
-+  return true;
- }
- 
- static void limit_calls_leave()
-@@ -112,6 +117,9 @@ bool COMXImage::CreateThumbnailFromSurface(unsigned char* buffer, unsigned int w
-       unsigned int format, unsigned int pitch, const std::string& destFile)
- {
-   COMXImageEnc omxImageEnc;
-+  if (!omxImageEnc.Gpu())
-+    return false;
-+
-   bool ret = omxImageEnc.CreateThumbnailFromSurface(buffer, width, height, format, pitch, destFile);
-   if (!ret)
-     CLog::Log(LOGNOTICE, "%s: unable to create thumbnail %s %dx%d", __func__, destFile.c_str(), width, height);
-@@ -205,6 +213,8 @@ bool COMXImage::CreateThumb(const std::string& srcFile, unsigned int maxHeight,
-   bool okay = false;
-   COMXImageFile file;
-   COMXImageReEnc reenc;
-+  if (!reenc.Gpu())
-+    return false;
-   void *pDestBuffer;
-   unsigned int nDestSize;
-   int orientation = additional_info == "flipped" ? 1:0;
-@@ -310,6 +320,9 @@ bool COMXImage::DecodeJpegToTexture(COMXImageFile *file, unsigned int width, uns
-   bool ret = false;
-   COMXTexture omx_image;
- 
-+  if (!omx_image.Gpu())
-+    return false;
-+
-   struct textureinfo *tex = new struct textureinfo;
-   if (!tex)
-     return NULL;
-@@ -924,7 +937,7 @@ bool COMXImageFile::ReadFile(const std::string& inputFile, int orientation)
- 
- COMXImageDec::COMXImageDec()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_decoded_buffer = NULL;
-   OMX_INIT_STRUCTURE(m_decoded_format);
-   m_success = false;
-@@ -936,7 +949,8 @@ COMXImageDec::~COMXImageDec()
- 
-   OMX_INIT_STRUCTURE(m_decoded_format);
-   m_decoded_buffer = NULL;
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXImageDec::Close()
-@@ -1086,6 +1100,9 @@ bool COMXImageDec::HandlePortSettingChange(unsigned int resize_width, unsigned i
- 
- bool COMXImageDec::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes, unsigned width, unsigned height, unsigned stride, void *pixels)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
-   OMX_BUFFERHEADERTYPE *omx_buffer = NULL;
-@@ -1223,7 +1240,7 @@ bool COMXImageDec::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes
- 
- COMXImageEnc::COMXImageEnc()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   CSingleLock lock(m_OMXSection);
-   OMX_INIT_STRUCTURE(m_encoded_format);
-   m_encoded_buffer = NULL;
-@@ -1247,11 +1264,15 @@ COMXImageEnc::~COMXImageEnc()
-       m_omx_encoder.Deinitialize();
-     }
-   }
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- bool COMXImageEnc::Encode(unsigned char *buffer, int size, unsigned width, unsigned height, unsigned int pitch)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
- 
-   unsigned int demuxer_bytes = 0;
-@@ -1432,6 +1453,9 @@ bool COMXImageEnc::Encode(unsigned char *buffer, int size, unsigned width, unsig
- bool COMXImageEnc::CreateThumbnailFromSurface(unsigned char* buffer, unsigned int width, unsigned int height,
-     unsigned int format, unsigned int pitch, const std::string& destFile)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   if(format != XB_FMT_A8R8G8B8 || !buffer)
-   {
-     CLog::Log(LOGDEBUG, "%s::%s : %s failed format=0x%x\n", CLASSNAME, __func__, destFile.c_str(), format);
-@@ -1465,7 +1489,7 @@ bool COMXImageEnc::CreateThumbnailFromSurface(unsigned char* buffer, unsigned in
- 
- COMXImageReEnc::COMXImageReEnc()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_encoded_buffer = NULL;
-   m_pDestBuffer = NULL;
-   m_nDestAllocSize = 0;
-@@ -1479,7 +1503,8 @@ COMXImageReEnc::~COMXImageReEnc()
-     free (m_pDestBuffer);
-   m_pDestBuffer = NULL;
-   m_nDestAllocSize = 0;
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXImageReEnc::Close()
-@@ -1771,6 +1796,9 @@ bool COMXImageReEnc::HandlePortSettingChange(unsigned int resize_width, unsigned
- 
- bool COMXImageReEnc::ReEncode(COMXImageFile &srcFile, unsigned int maxWidth, unsigned int maxHeight, void * &pDestBuffer, unsigned int &nDestSize)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
- 
-@@ -1943,14 +1971,15 @@ bool COMXImageReEnc::ReEncode(COMXImageFile &srcFile, unsigned int maxWidth, uns
- 
- COMXTexture::COMXTexture()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_success = false;
- }
- 
- COMXTexture::~COMXTexture()
- {
-   Close();
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXTexture::Close()
-@@ -2134,6 +2163,9 @@ bool COMXTexture::HandlePortSettingChange(unsigned int resize_width, unsigned in
- 
- bool COMXTexture::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes, unsigned int width, unsigned int height, void *egl_image)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
- 
-diff --git a/xbmc/cores/omxplayer/OMXImage.h b/xbmc/cores/omxplayer/OMXImage.h
-index a93aa82..6f38dbc 100644
---- a/xbmc/cores/omxplayer/OMXImage.h
-+++ b/xbmc/cores/omxplayer/OMXImage.h
-@@ -133,6 +133,7 @@ protected:
-   OMX_PARAM_PORTDEFINITIONTYPE  m_decoded_format;
-   CCriticalSection              m_OMXSection;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXImageEnc
-@@ -144,6 +145,7 @@ public:
-   // Required overrides
-   bool CreateThumbnailFromSurface(unsigned char* buffer, unsigned int width, unsigned int height,
-       unsigned int format, unsigned int pitch, const std::string& destFile);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool Encode(unsigned char *buffer, int size, unsigned int width, unsigned int height, unsigned int pitch);
-   // Components
-@@ -152,6 +154,7 @@ protected:
-   OMX_PARAM_PORTDEFINITIONTYPE  m_encoded_format;
-   CCriticalSection              m_OMXSection;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXImageReEnc
-@@ -163,6 +166,7 @@ public:
-   // Required overrides
-   void Close();
-   bool ReEncode(COMXImageFile &srcFile, unsigned int width, unsigned int height, void * &pDestBuffer, unsigned int &nDestSize);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool HandlePortSettingChange(unsigned int resize_width, unsigned int resize_height, int orientation, bool port_settings_changed);
-   // Components
-@@ -176,6 +180,7 @@ protected:
-   void                          *m_pDestBuffer;
-   unsigned int                  m_nDestAllocSize;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXTexture
-@@ -187,6 +192,7 @@ public:
-   // Required overrides
-   void Close(void);
-   bool Decode(const uint8_t *data, unsigned size, unsigned int width, unsigned int height, void *egl_image);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool HandlePortSettingChange(unsigned int resize_width, unsigned int resize_height, void *egl_image, bool port_settings_changed);
- 
-@@ -201,6 +207,7 @@ protected:
-   OMX_BUFFERHEADERTYPE *m_egl_buffer;
-   CCriticalSection              m_OMXSection;
-   bool              m_success;
-+  bool              m_gpu;
- };
- 
- extern COMXImage g_OMXImage;
-
-From dfb7b32bb3d8220a30ad67a26dfc388b4c4d9f43 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 2 Jan 2016 18:08:16 +0000
-Subject: [PATCH 75/93] [mmalcodec] Don't send zero sized extradata
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 3579966..0f5c1b7 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -501,7 +501,7 @@ bool CMMALVideo::SendCodecConfigData()
- {
-   CSingleLock lock(m_sharedSection);
-   MMAL_STATUS_T status;
--  if (!m_dec_input_pool)
-+  if (!m_dec_input_pool || !m_hints.extrasize)
-     return true;
-   // send code config data
-   MMAL_BUFFER_HEADER_T *buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-
-From 374227275b47f31ca0cca887a12e5cce187cdd55 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 3 Jan 2016 19:12:16 +0000
-Subject: [PATCH 76/93] stereoscopicmanager: Ensure we don't have a stale value
- of videoplayer stereoscopic mode
-
----
- xbmc/guilib/StereoscopicsManager.cpp | 30 +++++++++++++++++++++---------
- xbmc/guilib/StereoscopicsManager.h   |  2 ++
- 2 files changed, 23 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/guilib/StereoscopicsManager.cpp b/xbmc/guilib/StereoscopicsManager.cpp
-index d9e0fa4..4942f01 100644
---- a/xbmc/guilib/StereoscopicsManager.cpp
-+++ b/xbmc/guilib/StereoscopicsManager.cpp
-@@ -140,12 +140,7 @@ void CStereoscopicsManager::SetStereoMode(const RENDER_STEREO_MODE &mode)
- 
-   // resolve automatic mode before applying
-   if (mode == RENDER_STEREO_MODE_AUTO)
--  {
--    if (g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
--      applyMode = GetStereoModeOfPlayingVideo();
--    else
--      applyMode = RENDER_STEREO_MODE_OFF;
--  }
-+    applyMode = GetStereoModeOfPlayingVideo();
- 
-   if (applyMode != currentMode && applyMode >= RENDER_STEREO_MODE_OFF)
-   {
-@@ -209,7 +204,7 @@ RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeByUserChoice(const std::s
- {
-   RENDER_STEREO_MODE mode = GetStereoMode();
-   // if no stereo mode is set already, suggest mode of current video by preselecting it
--  if (mode == RENDER_STEREO_MODE_OFF && g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
-+  if (mode == RENDER_STEREO_MODE_OFF)
-     mode = GetStereoModeOfPlayingVideo();
- 
-   CGUIDialogSelect* pDlgSelect = (CGUIDialogSelect*)g_windowManager.GetWindow(WINDOW_DIALOG_SELECT);
-@@ -254,8 +249,8 @@ RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeByUserChoice(const std::s
- RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeOfPlayingVideo(void)
- {
-   RENDER_STEREO_MODE mode = RENDER_STEREO_MODE_OFF;
-+  std::string playerMode = GetVideoStereoMode();
- 
--  std::string playerMode = g_infoManager.GetLabel(VIDEOPLAYER_STEREOSCOPIC_MODE);
-   if (!playerMode.empty())
-   {
-     int convertedMode = ConvertVideoToGuiStereoMode(playerMode);
-@@ -504,6 +499,23 @@ void CStereoscopicsManager::ApplyStereoMode(const RENDER_STEREO_MODE &mode, bool
-   }
- }
- 
-+std::string CStereoscopicsManager::GetVideoStereoMode()
-+{
-+  std::string playerMode;
-+  if (g_application.m_pPlayer->IsPlaying())
-+  {
-+    SPlayerVideoStreamInfo videoInfo;
-+    g_application.m_pPlayer->GetVideoStreamInfo(videoInfo);
-+    playerMode = videoInfo.stereoMode;
-+  }
-+  return playerMode;
-+}
-+
-+bool CStereoscopicsManager::IsVideoStereoscopic()
-+{
-+  return !GetVideoStereoMode().empty();
-+}
-+
- void CStereoscopicsManager::OnPlaybackStarted(void)
- {
-   STEREOSCOPIC_PLAYBACK_MODE playbackMode = (STEREOSCOPIC_PLAYBACK_MODE) CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_STEREOSCOPICPLAYBACKMODE);
-@@ -513,7 +525,7 @@ void CStereoscopicsManager::OnPlaybackStarted(void)
-   if (playbackMode == STEREOSCOPIC_PLAYBACK_MODE_IGNORE && mode == RENDER_STEREO_MODE_OFF)
-     return;
- 
--  if (!g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
-+  if (!CStereoscopicsManager::IsVideoStereoscopic())
-   {
-     // exit stereo mode if started item is not stereoscopic
-     // and if user prefers to stop 3D playback when movie is finished
-diff --git a/xbmc/guilib/StereoscopicsManager.h b/xbmc/guilib/StereoscopicsManager.h
-index ec2310f..f090bb9 100644
---- a/xbmc/guilib/StereoscopicsManager.h
-+++ b/xbmc/guilib/StereoscopicsManager.h
-@@ -92,6 +92,8 @@ private:
-   void ApplyStereoMode(const RENDER_STEREO_MODE &mode, bool notify = true);
-   void OnPlaybackStarted(void);
-   void OnPlaybackStopped(void);
-+  std::string GetVideoStereoMode();
-+  bool IsVideoStereoscopic();
- 
-   RENDER_STEREO_MODE m_stereoModeSetByUser;
-   RENDER_STEREO_MODE m_lastStereoModeSetByUser;
-
-From d33bcf6304a60bfbdbc993eadab11505ae675851 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 77/93] [3d] Make MVC a valid 3D filename tag
-
----
- xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
- xbmc/settings/AdvancedSettings.cpp   | 2 ++
- xbmc/settings/AdvancedSettings.h     | 1 +
- 3 files changed, 12 insertions(+)
-
-diff --git a/xbmc/guilib/StereoscopicsManager.cpp b/xbmc/guilib/StereoscopicsManager.cpp
-index 4942f01..ff67d0d 100644
---- a/xbmc/guilib/StereoscopicsManager.cpp
-+++ b/xbmc/guilib/StereoscopicsManager.cpp
-@@ -197,6 +197,15 @@ std::string CStereoscopicsManager::DetectStereoModeByString(const std::string &n
-   if (re.RegFind(searchString) > -1)
-     stereoMode = "top_bottom";
- 
-+  if (!re.RegComp(g_advancedSettings.m_stereoscopicregex_mvc.c_str()))
-+  {
-+    CLog::Log(LOGERROR, "%s: Invalid RegExp for matching 3d MVC content:'%s'", __FUNCTION__, g_advancedSettings.m_stereoscopicregex_mvc.c_str());
-+    return stereoMode;
-+  }
-+
-+  if (re.RegFind(searchString) > -1)
-+    stereoMode = "left_right";
-+
-   return stereoMode;
- }
- 
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 8045a03..aeea13b 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -388,6 +388,7 @@ void CAdvancedSettings::Initialize()
-   m_stereoscopicregex_3d = "[-. _]3d[-. _]";
-   m_stereoscopicregex_sbs = "[-. _]h?sbs[-. _]";
-   m_stereoscopicregex_tab = "[-. _]h?tab[-. _]";
-+  m_stereoscopicregex_mvc = "[-. _]h?mvc[-. _]";
- 
-   m_videoAssFixedWorks = false;
- 
-@@ -514,6 +515,7 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-     XMLUtils::GetString(pElement, "stereoscopicregex3d", m_stereoscopicregex_3d);
-     XMLUtils::GetString(pElement, "stereoscopicregexsbs", m_stereoscopicregex_sbs);
-     XMLUtils::GetString(pElement, "stereoscopicregextab", m_stereoscopicregex_tab);
-+    XMLUtils::GetString(pElement, "stereoscopicregexmvc", m_stereoscopicregex_mvc);
-     XMLUtils::GetFloat(pElement, "subsdelayrange", m_videoSubsDelayRange, 10, 600);
-     XMLUtils::GetFloat(pElement, "audiodelayrange", m_videoAudioDelayRange, 10, 600);
-     XMLUtils::GetInt(pElement, "blackbarcolour", m_videoBlackBarColour, 0, 255);
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 93de9bd..4da88f4 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -386,6 +386,7 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
-     std::string m_stereoscopicregex_3d;
-     std::string m_stereoscopicregex_sbs;
-     std::string m_stereoscopicregex_tab;
-+    std::string m_stereoscopicregex_mvc;
- 
-     /*!< @brief position behavior of ass subtitiles when setting "subtitle position on screen" set to "fixed"
-     True to show at the fixed position set in video calibration
-
-From ff22ccfba36a15d2ed383bf5543f2dd3b9c6a618 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 24 Jan 2016 16:42:04 +0000
-Subject: [PATCH 78/93] fixup! [build] Add patches to ffmpeg for native build
-
----
- tools/depends/target/ffmpeg/autobuild.sh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
-index f6d4c3b..cc59d17 100755
---- a/tools/depends/target/ffmpeg/autobuild.sh
-+++ b/tools/depends/target/ffmpeg/autobuild.sh
-@@ -131,7 +131,7 @@ patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
- patch -p1 < ../pfcd_hevc_optimisations.patch
--patch -p1 < ../add_h264_MVC_support.patch
-+patch -p1 < ../0001-Squashed-commit-of-the-following.patch
- 
- CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
- ./configure --prefix=$FFMPEG_PREFIX \
-
-From 8bcf9f72ff12412fdc4c8139be071c2448d51ae7 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 19:58:30 +0000
-Subject: [PATCH 79/93] OMXAudio: Make use of m_bGotFrame to skip decoding when
- full
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index 33c4c6a..b9dab89 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -158,7 +158,8 @@ int COMXAudioCodecOMX::Decode(BYTE* pData, int iSize, double dts, double pts)
-   if (!m_pCodecContext) return -1;
- 
-   AVPacket avpkt;
--  m_bGotFrame = false;
-+  if (m_bGotFrame)
-+    return 0;
-   av_init_packet(&avpkt);
-   avpkt.data = pData;
-   avpkt.size = iSize;
-@@ -257,6 +258,7 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-       outputSize = 0;
-     }
-   }
-+  m_bGotFrame = false;
-   int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
- 
-   if (m_bFirstFrame)
-@@ -274,7 +276,6 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   if (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate)
-   {
-      int ret = m_iBufferOutputUsed;
--     m_bGotFrame = false;
-      m_iBufferOutputUsed = 0;
-      dts = m_dts;
-      pts = m_pts;
-
-From 7044ba837edb2060a28bf534f5327d90e1c545e5 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 20:01:18 +0000
-Subject: [PATCH 80/93] OMXAudio: Handle GetData before adding the next buffer
- so we can be sure it fits
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 28 ++++++++++++++--------------
- 1 file changed, 14 insertions(+), 14 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index b9dab89..f150dc6 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -206,12 +206,24 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   /* output audio will be packed */
-   int outputSize = av_samples_get_buffer_size(&outLineSize, m_pCodecContext->channels, m_pFrame1->nb_samples, m_desiredSampleFormat, 1);
- 
-+  // if this buffer won't fit then flush out what we have
-+  int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
-+  if (m_iBufferOutputUsed && (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate))
-+  {
-+     int ret = m_iBufferOutputUsed;
-+     m_iBufferOutputUsed = 0;
-+     dts = m_dts;
-+     pts = m_pts;
-+     *dst = m_pBufferOutput;
-+     return ret;
-+  }
-+  m_frameSize = outputSize;
-+
-   if (m_iBufferOutputAlloced < m_iBufferOutputUsed + outputSize)
-   {
-      m_pBufferOutput = (BYTE*)av_realloc(m_pBufferOutput, m_iBufferOutputUsed + outputSize + FF_INPUT_BUFFER_PADDING_SIZE);
-      m_iBufferOutputAlloced = m_iBufferOutputUsed + outputSize;
-   }
--  *dst = m_pBufferOutput;
- 
-   /* need to convert format */
-   if(m_pCodecContext->sample_fmt != m_desiredSampleFormat)
-@@ -259,28 +271,16 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-     }
-   }
-   m_bGotFrame = false;
--  int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
- 
-   if (m_bFirstFrame)
-   {
--    CLog::Log(LOGDEBUG, "COMXAudioCodecOMX::GetData size=%d/%d line=%d/%d buf=%p, desired=%d", inputSize, outputSize, inLineSize, outLineSize, *dst, desired_size);
-+    CLog::Log(LOGDEBUG, "COMXAudioCodecOMX::GetData size=%d/%d line=%d/%d buf=%p, desired=%d", inputSize, outputSize, inLineSize, outLineSize, m_pBufferOutput, desired_size);
-     m_bFirstFrame = false;
-   }
-   m_iBufferOutputUsed += outputSize;
- 
-   if (!m_bNoConcatenate && m_pCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP && m_frameSize && (int)m_frameSize != outputSize)
-     CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
--  m_frameSize = outputSize;
--
--  // if next buffer submitted won't fit then flush it out
--  if (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate)
--  {
--     int ret = m_iBufferOutputUsed;
--     m_iBufferOutputUsed = 0;
--     dts = m_dts;
--     pts = m_pts;
--     return ret;
--  }
-   return 0;
- }
- 
-
-From c4abc577bf4c12d48cc800930c2d292c0a65031f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 20:03:49 +0000
-Subject: [PATCH 81/93] OMXAudio: Handle changes in decoded audio size
- correctly
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 14 +++++++-------
- 1 file changed, 7 insertions(+), 7 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index f150dc6..4956b5b 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -103,10 +103,6 @@ bool COMXAudioCodecOMX::Open(CDVDStreamInfo &hints)
-   if (m_pCodecContext->request_channel_layout)
-     CLog::Log(LOGNOTICE,"COMXAudioCodecOMX::Open() Requesting channel layout of %x", (unsigned)m_pCodecContext->request_channel_layout);
- 
--  // vorbis and wma2v2 have variable sized planar output, so skip concatenation
--  if (hints.codec == AV_CODEC_ID_VORBIS || hints.codec == AV_CODEC_ID_WMAV2)
--    m_bNoConcatenate = true;
--
-   if(m_pCodecContext->bits_per_coded_sample == 0)
-     m_pCodecContext->bits_per_coded_sample = 16;
- 
-@@ -206,12 +202,19 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   /* output audio will be packed */
-   int outputSize = av_samples_get_buffer_size(&outLineSize, m_pCodecContext->channels, m_pFrame1->nb_samples, m_desiredSampleFormat, 1);
- 
-+  if (!m_bNoConcatenate && m_iBufferOutputUsed && (int)m_frameSize != outputSize)
-+  {
-+    CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
-+    m_bNoConcatenate = true;
-+  }
-+
-   // if this buffer won't fit then flush out what we have
-   int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
-   if (m_iBufferOutputUsed && (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate))
-   {
-      int ret = m_iBufferOutputUsed;
-      m_iBufferOutputUsed = 0;
-+     m_bNoConcatenate = false;
-      dts = m_dts;
-      pts = m_pts;
-      *dst = m_pBufferOutput;
-@@ -278,9 +281,6 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-     m_bFirstFrame = false;
-   }
-   m_iBufferOutputUsed += outputSize;
--
--  if (!m_bNoConcatenate && m_pCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP && m_frameSize && (int)m_frameSize != outputSize)
--    CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
-   return 0;
- }
- 
-
-From df922f986ab0b2dc1363224ef6c72a7a8ac616dc Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 4 Feb 2016 15:29:55 +0000
-Subject: [PATCH 82/93] MMALCodec: Set dropped flag on output pictures when
- input requested that
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 6 ++++++
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   | 1 +
- 2 files changed, 7 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 0f5c1b7..7b025fd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -759,6 +759,7 @@ void CMMALVideo::SetDropState(bool bDrop)
- {
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s - bDrop(%d)", CLASSNAME, __func__, bDrop);
-+  m_dropState = bDrop;
- }
- 
- int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-@@ -794,6 +795,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-        buffer->length = (uint32_t)iSize > buffer->alloc_size ? buffer->alloc_size : (uint32_t)iSize;
-        // set a flag so we can identify primary frames from generated frames (deinterlace)
-        buffer->flags = MMAL_BUFFER_HEADER_FLAG_USER0;
-+       if (m_dropState)
-+         buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER3;
- 
-        memcpy(buffer->data, pData, buffer->length);
-        iSize -= buffer->length;
-@@ -944,6 +947,7 @@ void CMMALVideo::Reset(void)
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
-   m_codecControlFlags = 0;
-+  m_dropState = false;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -1021,6 +1025,8 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
- 
-     pDvdVideoPicture->MMALBuffer->Acquire();
-     pDvdVideoPicture->iFlags  = DVP_FLAG_ALLOCATED;
-+    if (buffer->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER3)
-+      pDvdVideoPicture->iFlags |= DVP_FLAG_DROPPED;
-     if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-       CLog::Log(LOGINFO, "%s::%s dts:%.3f pts:%.3f flags:%x:%x MMALBuffer:%p mmal_buffer:%p", CLASSNAME, __func__,
-           pDvdVideoPicture->dts == DVD_NOPTS_VALUE ? 0.0 : pDvdVideoPicture->dts*1e-6, pDvdVideoPicture->pts == DVD_NOPTS_VALUE ? 0.0 : pDvdVideoPicture->pts*1e-6,
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index ca28c6f..bf669e0 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -124,6 +124,7 @@ protected:
-   double            m_decoderPts;
-   int               m_speed;
-   int               m_codecControlFlags;
-+  bool              m_dropState;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From a6102bec84b610166da7448d80b853e5efd649a1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 3 Feb 2016 21:35:01 +0000
-Subject: [PATCH 83/93] DVDVideoCodecFFmpeg: Enable refcounted frames
-
-Without this frames will get (deep) copied when deinterlace is set to automatic,
-but file is not deinterlaced.
-
-For 1080p24 that costs 150MB/s of memory bandwidth which is very expensive.
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-index c2f3287..64087f2 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-@@ -316,6 +316,10 @@ bool CDVDVideoCodecFFmpeg::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options
-       av_opt_set(m_pCodecContext, it->m_name.c_str(), it->m_value.c_str(), 0);
-   }
- 
-+  // If non-zero, the decoded audio and video frames returned from avcodec_decode_video2() are reference-counted and are valid indefinitely.
-+  // Without this frames will get (deep) copied when deinterlace is set to automatic, but file is not deinterlaced.
-+  m_pCodecContext->refcounted_frames = 1;
-+
-   if (avcodec_open2(m_pCodecContext, pCodec, NULL) < 0)
-   {
-     CLog::Log(LOGDEBUG,"CDVDVideoCodecFFmpeg::Open() Unable to open codec");
-
-From a301f546dcfa4bf1ceaa9737a60a835826d54fec Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 19 Feb 2016 13:45:23 +0000
-Subject: [PATCH 84/93] mmal: increase decode buffering a little to help harder
- MVC files
-
-PR8610 reduced buffering in codec which generally improved behaviour,
-but we have some reports of hard streams (like 3D BluRay) that now
-lag. The problem is when the codec's input buffer runs dry you
-waste useful decoder cycles. It seems adding another two frames of
-latency to decoder gets the performance back.
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 7b025fd..08f61fc 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -676,7 +676,7 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
- 
-   // limit number of callback structures in video_decode to reduce latency. Too low and video hangs.
-   // negative numbers have special meaning. -1=size of DPB -2=size of DPB+1
--  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -3);
-+  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -5);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to configure max num callbacks on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-
-From c0b0aad15a9ffcd921bb70af5bf9200ee7a93fed Mon Sep 17 00:00:00 2001
-From: Mario Holzinger <sandman01xda@gmail.com>
-Date: Fri, 11 Dec 2015 16:48:57 +0100
-Subject: [PATCH 85/93] touch panel to display adjustment
-
----
- xbmc/input/linux/LinuxInputDevices.cpp |  7 ++++---
- xbmc/settings/AdvancedSettings.cpp     | 16 ++++++++++++++++
- xbmc/settings/AdvancedSettings.h       |  6 ++++++
- 3 files changed, 26 insertions(+), 3 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index 9f3e866..a506956 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -98,6 +98,7 @@ typedef unsigned long kernel_ulong_t;
- #include "utils/log.h"
- #include "input/touch/generic/GenericTouchActionHandler.h"
- #include "input/touch/generic/GenericTouchInputHandler.h"
-+#include "settings/AdvancedSettings.h"
- 
- #ifndef BITS_PER_LONG
- #define BITS_PER_LONG        (sizeof(long) * 8)
-@@ -653,13 +654,13 @@ bool CLinuxInputDevice::AbsEvent(const struct input_event& levt, XBMC_Event& dev
-   switch (levt.code)
-   {
-   case ABS_X:
--    m_mouseX = levt.value;
-+    m_mouseX = (int)((float)levt.value * g_advancedSettings.m_screenAlign_xStretchFactor) + g_advancedSettings.m_screenAlign_xOffset; // stretch and shift touch x coordinates
-     break;
- 
-   case ABS_Y:
--    m_mouseY = levt.value;
-+    m_mouseY = (int)((float)levt.value * g_advancedSettings.m_screenAlign_yStretchFactor) + g_advancedSettings.m_screenAlign_yOffset; // stretch and shift touch y coordinates
-     break;
--  
-+
-   case ABS_MISC:
-     remoteStatus = levt.value & 0xFF;
-     break;
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index aeea13b..326e9f5 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -303,6 +303,12 @@ void CAdvancedSettings::Initialize()
-   m_iEdlCommBreakAutowait = 0;             // Off by default
-   m_iEdlCommBreakAutowind = 0;             // Off by default
- 
-+  // Touchscreen  default values if no adjustment is necessarry
-+  m_screenAlign_xOffset = 0;
-+  m_screenAlign_yOffset= 0;
-+  m_screenAlign_xStretchFactor = 1.0;
-+  m_screenAlign_yStretchFactor = 1.0;
-+
-   m_curlconnecttimeout = 10;
-   m_curllowspeedtime = 20;
-   m_curlretries = 2;
-@@ -871,6 +877,16 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-     XMLUtils::GetInt(pElement, "commbreakautowind", m_iEdlCommBreakAutowind, 0, 10);        // Between 0 and 10 seconds
-   }
- 
-+  // Touchscreen
-+  pElement = pRootElement->FirstChildElement("touchscreen");
-+  if (pElement)
-+  {
-+    XMLUtils::GetInt(pElement, "x_offset", m_screenAlign_xOffset );
-+    XMLUtils::GetInt(pElement, "y_offset", m_screenAlign_yOffset );
-+    XMLUtils::GetFloat(pElement, "x_stretch_factor", m_screenAlign_xStretchFactor );
-+    XMLUtils::GetFloat(pElement, "y_stretch_factor", m_screenAlign_yStretchFactor );
-+  }
-+
-   // picture exclude regexps
-   TiXmlElement* pPictureExcludes = pRootElement->FirstChildElement("pictureexcludes");
-   if (pPictureExcludes)
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 4da88f4..1bdc77e 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -334,6 +334,12 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
-     std::string m_cpuTempCmd;
-     std::string m_gpuTempCmd;
- 
-+    // Touchscreen
-+    int m_screenAlign_xOffset;
-+    int m_screenAlign_yOffset;
-+    float m_screenAlign_xStretchFactor;
-+    float m_screenAlign_yStretchFactor;
-+
-     /* PVR/TV related advanced settings */
-     int m_iPVRTimeCorrection;     /*!< @brief correct all times (epg tags, timer tags, recording tags) by this amount of minutes. defaults to 0. */
-     int m_iPVRInfoToggleInterval; /*!< @brief if there are more than 1 pvr gui info item available (e.g. multiple recordings active at the same time), use this toggle delay in milliseconds. defaults to 3000. */
-
-From d3c755950fe7e7b255a2a28cafd105affb3aab13 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 15 Feb 2016 15:51:11 +0000
-Subject: [PATCH 86/93] touch panel to display adjustment - support multitouch
-
-PR7978 allowed a simple calibration to be applied to touch input
-It didn't cover multitouch which this adds support for
----
- xbmc/input/linux/LinuxInputDevices.cpp | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index a506956..3579cd0 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -707,7 +707,7 @@ bool CLinuxInputDevice::mtAbsEvent(const struct input_event& levt)
-   case ABS_MT_POSITION_X:
-     if (m_mt_currentSlot < TOUCH_MAX_POINTERS)
-     {
--      m_mt_x[m_mt_currentSlot] = levt.value;
-+      m_mt_x[m_mt_currentSlot] = (int)((float)levt.value * g_advancedSettings.m_screenAlign_xStretchFactor) + g_advancedSettings.m_screenAlign_xOffset; // stretch and shift touch x coordinates
-       if (m_mt_event[m_mt_currentSlot] == TouchInputUnchanged)
-         m_mt_event[m_mt_currentSlot] = TouchInputMove;
-     }
-@@ -716,7 +716,7 @@ bool CLinuxInputDevice::mtAbsEvent(const struct input_event& levt)
-   case ABS_MT_POSITION_Y:
-     if (m_mt_currentSlot < TOUCH_MAX_POINTERS)
-     {
--      m_mt_y[m_mt_currentSlot] = levt.value;
-+      m_mt_y[m_mt_currentSlot] = (int)((float)levt.value * g_advancedSettings.m_screenAlign_yStretchFactor) + g_advancedSettings.m_screenAlign_yOffset; // stretch and shift touch y coordinates;
-       if (m_mt_event[m_mt_currentSlot] == TouchInputUnchanged)
-         m_mt_event[m_mt_currentSlot] = TouchInputMove;
-     }
-
-From f517a6ff4ab7f04b9a6ba371d3429e5ae95eb3d1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 22 Mar 2016 22:28:13 +0000
-Subject: [PATCH 87/93] [linux] Move hotplug checking into its own thread
-
-Currently checking for new linux input devices is called from the rendering thread.
-We've been getting reports of skipped frames on raspberry pi.
-
-Specifically if eventlirc is active and you have an LIRC capable device connected
-the hotplug check is slow and you get a frame skip every ten seconds.
-
-So move this code into its own thread
----
- xbmc/input/linux/LinuxInputDevices.cpp | 54 +++++++++++++++++++++-------------
- xbmc/input/linux/LinuxInputDevices.h   | 14 ++++++++-
- xbmc/windowing/WinEventsLinux.cpp      |  1 +
- xbmc/windowing/WinEventsLinux.h        |  2 ++
- 4 files changed, 49 insertions(+), 22 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index 3579cd0..7b3c6ad 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -1081,6 +1081,30 @@ bool CLinuxInputDevice::IsUnplugged()
-   return m_bUnplugged;
- }
- 
-+CLinuxInputDevicesCheckHotplugged::CLinuxInputDevicesCheckHotplugged(CLinuxInputDevices &parent) :
-+    CThread("CLinuxInputDevicesCheckHotplugged"), m_parent(parent)
-+{
-+  Create();
-+  SetPriority(THREAD_PRIORITY_BELOW_NORMAL);
-+}
-+
-+CLinuxInputDevicesCheckHotplugged::~CLinuxInputDevicesCheckHotplugged()
-+{
-+  m_bStop = true;
-+  m_quitEvent.Set();
-+  StopThread(true);
-+}
-+
-+void CLinuxInputDevicesCheckHotplugged::Process()
-+{
-+  while (!m_bStop)
-+  {
-+    m_parent.CheckHotplugged();
-+    // every ten seconds
-+    m_quitEvent.WaitMSec(10000);
-+  }
-+}
-+
- bool CLinuxInputDevices::CheckDevice(const char *device)
- {
-   int fd;
-@@ -1147,10 +1171,6 @@ void CLinuxInputDevices::InitAvailable()
-  */
- void CLinuxInputDevices::CheckHotplugged()
- {
--  CSingleLock lock(m_devicesListLock);
--
--  int deviceId = m_devices.size();
--
-   /* No devices specified. Try to guess some. */
-   for (int i = 0; i < MAX_LINUX_INPUT_DEVICES; i++)
-   {
-@@ -1158,18 +1178,22 @@ void CLinuxInputDevices::CheckHotplugged()
-     bool ispresent = false;
- 
-     snprintf(buf, 32, "/dev/input/event%d", i);
--
--    for (size_t j = 0; j < m_devices.size(); j++)
-     {
--      if (m_devices[j]->GetFileName().compare(buf) == 0)
-+      CSingleLock lock(m_devicesListLock);
-+      for (size_t j = 0; j < m_devices.size(); j++)
-       {
--        ispresent = true;
--        break;
-+        if (m_devices[j]->GetFileName().compare(buf) == 0)
-+        {
-+          ispresent = true;
-+          break;
-+        }
-       }
-     }
- 
-     if (!ispresent && CheckDevice(buf))
-     {
-+      CSingleLock lock(m_devicesListLock);
-+      int deviceId = m_devices.size();
-       CLog::Log(LOGINFO, "Found input device %s", buf);
-       m_devices.push_back(new CLinuxInputDevice(buf, deviceId));
-       ++deviceId;
-@@ -1360,18 +1384,6 @@ XBMC_Event CLinuxInputDevices::ReadEvent()
-     InitAvailable();
-     m_bReInitialize = false;
-   }
--  else
--  {
--    time_t now;
--    time(&now);
--
--    if ((now - m_lastHotplugCheck) >= 10)
--    {
--      CheckHotplugged();
--      m_lastHotplugCheck = now;
--    }
--  }
--
-   CSingleLock lock(m_devicesListLock);
- 
-   XBMC_Event event;
-diff --git a/xbmc/input/linux/LinuxInputDevices.h b/xbmc/input/linux/LinuxInputDevices.h
-index 8c88a1d..4fde1aa 100644
---- a/xbmc/input/linux/LinuxInputDevices.h
-+++ b/xbmc/input/linux/LinuxInputDevices.h
-@@ -28,6 +28,7 @@
- #include "threads/SingleLock.h"
- #include "input/touch/ITouchInputHandler.h"
- #include "input/touch/generic/IGenericTouchGestureDetector.h"
-+#include "threads/Thread.h"
- 
- struct KeymapEntry
- {
-@@ -105,7 +106,18 @@ private:
-   bool CheckDevice(const char *device);
-   std::vector<CLinuxInputDevice*> m_devices;
-   bool m_bReInitialize;
--  time_t m_lastHotplugCheck;
-+};
-+
-+class CLinuxInputDevicesCheckHotplugged : protected CThread
-+{
-+public:
-+  CLinuxInputDevicesCheckHotplugged(CLinuxInputDevices &parent);
-+  ~CLinuxInputDevicesCheckHotplugged();
-+private:
-+  CLinuxInputDevices &m_parent;
-+  CEvent m_quitEvent;
-+protected:
-+  virtual void Process();
- };
- 
- #endif /* LINUXINPUTDEVICES_H_ */
-diff --git a/xbmc/windowing/WinEventsLinux.cpp b/xbmc/windowing/WinEventsLinux.cpp
-index 2b3d77a..fb6c987 100644
---- a/xbmc/windowing/WinEventsLinux.cpp
-+++ b/xbmc/windowing/WinEventsLinux.cpp
-@@ -134,6 +134,7 @@ bool CWinEventsLinux::MessagePump()
-   if (!m_initialized)
-   {
-     m_devices.InitAvailable();
-+    m_checkHotplug = std::unique_ptr<CLinuxInputDevicesCheckHotplugged>(new CLinuxInputDevicesCheckHotplugged(m_devices));
-     m_initialized = true;
- #ifdef TARGET_RASPBERRY_PI
-     LoadXML("Pointer.xml");
-diff --git a/xbmc/windowing/WinEventsLinux.h b/xbmc/windowing/WinEventsLinux.h
-index 23244a2..c82ba84 100644
---- a/xbmc/windowing/WinEventsLinux.h
-+++ b/xbmc/windowing/WinEventsLinux.h
-@@ -22,6 +22,7 @@
- #define WINDOW_EVENTS_LINUX_H
- 
- #pragma once
-+#include <memory>
- #include "windowing/WinEvents.h"
- #include "input/linux/LinuxInputDevices.h"
- #include "guilib/TextureManager.h"
-@@ -44,6 +45,7 @@ public:
- private:
-   static bool m_initialized;
-   static CLinuxInputDevices m_devices;
-+  std::unique_ptr<CLinuxInputDevicesCheckHotplugged> m_checkHotplug;
- #ifdef TARGET_RASPBERRY_PI
-   bool LoadXML(const std::string strFileName);
-   int64_t m_last_mouse_move_time;
-
-From ffb8b5378dbb2c53f1411e051f0c7eec9555ca83 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Jul 2015 17:47:33 +0100
-Subject: [PATCH 88/93] [rbp] Refactor the vsync handle to support multiple
- callers
-
----
- xbmc/linux/RBP.cpp | 100 ++++++++++++++++++++++++++++++++++-------------------
- xbmc/linux/RBP.h   |  10 ++++--
- 2 files changed, 73 insertions(+), 37 deletions(-)
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 13b0504..ddc2b9c 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -34,6 +34,7 @@
- #include <sys/ioctl.h>
- #include <linux/ioctl.h>
- #include "rpi_user_vcsm.h"
-+#include "utils/TimeUtils.h"
- 
- #define MAJOR_NUM 100
- #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-@@ -56,6 +57,8 @@ CRBP::CRBP()
-   m_enabled = 0;
-   m_mb = mbox_open();
-   vcsm_init();
-+  m_vsync_count = 0;
-+  m_last_vsync = 0;
- }
- 
- CRBP::~CRBP()
-@@ -73,7 +76,7 @@ void CRBP::InitializeSettings()
- 
- bool CRBP::Initialize()
- {
--  CSingleLock lock (m_critSection);
-+  CSingleLock lock(m_critSection);
-   if (m_initialized)
-     return true;
- 
-@@ -133,11 +136,62 @@ void CRBP::LogFirmwareVerison()
-   CLog::Log(LOGNOTICE, "Config:\n%s", response);
- }
- 
-+static void vsync_callback_static(DISPMANX_UPDATE_HANDLE_T u, void *arg)
-+{
-+  CRBP *rbp = reinterpret_cast<CRBP*>(arg);
-+  rbp->VSyncCallback();
-+}
-+
-+void CRBP::VSyncCallback()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  m_vsync_count++;
-+  m_last_vsync = CurrentHostCounter();
-+  m_vsync_cond.notifyAll();
-+}
-+
-+unsigned int CRBP::VsyncCount()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  return m_vsync_count;
-+}
-+
-+int64_t CRBP::LastVsync()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  return m_last_vsync;
-+}
-+
-+unsigned int CRBP::WaitVsync(unsigned int target)
-+{
-+  CSingleLock vlock(m_vsync_lock);
-+  DISPMANX_DISPLAY_HANDLE_T display = m_display;
-+  XbmcThreads::EndTime delay(50);
-+  if (target == ~0U)
-+    target = m_vsync_count+1;
-+  while (!delay.IsTimePast())
-+  {
-+    CSingleLock lock(m_critSection);
-+    if (m_vsync_count >= target)
-+      break;
-+    lock.Leave();
-+    if (!m_vsync_cond.wait(vlock, delay.MillisLeft()))
-+      break;
-+  }
-+  if (m_vsync_count < target)
-+    CLog::Log(LOGDEBUG, "CRBP::%s no  vsync %d/%d display:%x(%x) delay:%d", __FUNCTION__, m_vsync_count, target, m_display, display, delay.MillisLeft());
-+
-+  return m_vsync_count;
-+}
-+
- DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- {
-+  CSingleLock lock(m_critSection);
-   if (m_display == DISPMANX_NO_HANDLE)
-   {
-     m_display = vc_dispmanx_display_open( 0 /*screen*/ );
-+    int s = vc_dispmanx_vsync_callback(m_display, vsync_callback_static, (void *)this);
-+    assert(s == 0);
-     init_cursor();
-   }
-   return m_display;
-@@ -145,16 +199,20 @@ DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- 
- void CRBP::CloseDisplay(DISPMANX_DISPLAY_HANDLE_T display)
- {
-+  CSingleLock lock(m_critSection);
-   assert(display == m_display);
-+  int s = vc_dispmanx_vsync_callback(m_display, NULL, NULL);
-+  assert(s == 0);
-+  uninit_cursor();
-   vc_dispmanx_display_close(m_display);
-   m_display = DISPMANX_NO_HANDLE;
--  uninit_cursor();
- }
- 
- void CRBP::GetDisplaySize(int &width, int &height)
- {
-+  CSingleLock lock(m_critSection);
-   DISPMANX_MODEINFO_T info;
--  if (vc_dispmanx_display_get_info(m_display, &info) == 0)
-+  if (m_display != DISPMANX_NO_HANDLE && vc_dispmanx_display_get_info(m_display, &info) == 0)
-   {
-     width = info.width;
-     height = info.height;
-@@ -183,13 +241,13 @@ unsigned char *CRBP::CaptureDisplay(int width, int height, int *pstride, bool sw
-     flags |= DISPMANX_SNAPSHOT_PACK;
- 
-   stride = ((width + 15) & ~15) * 4;
--  image = new unsigned char [height * stride];
- 
--  if (image)
-+  CSingleLock lock(m_critSection);
-+  if (m_display != DISPMANX_NO_HANDLE)
-   {
-+    image = new unsigned char [height * stride];
-     resource = vc_dispmanx_resource_create( VC_IMAGE_RGBA32, width, height, &vc_image_ptr );
- 
--    assert(m_display != DISPMANX_NO_HANDLE);
-     vc_dispmanx_snapshot(m_display, resource, (DISPMANX_TRANSFORM_T)flags);
- 
-     vc_dispmanx_rect_set(&rect, 0, 0, width, height);
-@@ -201,35 +259,6 @@ unsigned char *CRBP::CaptureDisplay(int width, int height, int *pstride, bool sw
-   return image;
- }
- 
--
--static void vsync_callback(DISPMANX_UPDATE_HANDLE_T u, void *arg)
--{
--  CEvent *sync = (CEvent *)arg;
--  sync->Set();
--}
--
--void CRBP::WaitVsync()
--{
--  int s;
--  DISPMANX_DISPLAY_HANDLE_T m_display = vc_dispmanx_display_open( 0 /*screen*/ );
--  if (m_display == DISPMANX_NO_HANDLE)
--  {
--    CLog::Log(LOGDEBUG, "CRBP::%s skipping while display closed", __func__);
--    return;
--  }
--  m_vsync.Reset();
--  s = vc_dispmanx_vsync_callback(m_display, vsync_callback, (void *)&m_vsync);
--  if (s == 0)
--  {
--    m_vsync.WaitMSec(1000);
--  }
--  else assert(0);
--  s = vc_dispmanx_vsync_callback(m_display, NULL, NULL);
--  assert(s == 0);
--  vc_dispmanx_display_close( m_display );
--}
--
--
- void CRBP::Deinitialize()
- {
-   if (m_omx_image_init)
-@@ -260,6 +289,7 @@ void CRBP::Deinitialize()
- 
- double CRBP::AdjustHDMIClock(double adjust)
- {
-+  CSingleLock lock(m_critSection);
-   char response[80];
-   vc_gencmd(response, sizeof response, "hdmi_adjust_clock %f", adjust);
-   char *p = strchr(response, '=');
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 2eee35d..3b59cd9 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -77,7 +77,10 @@ public:
-   // stride can be null for packed output
-   unsigned char *CaptureDisplay(int width, int height, int *stride, bool swap_red_blue, bool video_only = true);
-   DllOMX *GetDllOMX() { return m_OMX ? m_OMX->GetDll() : NULL; }
--  void WaitVsync();
-+  unsigned int WaitVsync(unsigned int target = ~0U);
-+  int64_t LastVsync();
-+  unsigned int VsyncCount();
-+  void VSyncCallback();
-   double AdjustHDMIClock(double adjust);
-   double GetAdjustHDMIClock() { return m_last_pll_adjust; }
-   int GetMBox() { return m_mb; }
-@@ -94,7 +97,10 @@ private:
-   bool       m_codec_wvc1_enabled;
-   COMXCore   *m_OMX;
-   DISPMANX_DISPLAY_HANDLE_T m_display;
--  CEvent     m_vsync;
-+  CCriticalSection m_vsync_lock;
-+  XbmcThreads::ConditionVariable m_vsync_cond;
-+  unsigned int m_vsync_count;
-+  int64_t m_last_vsync;
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-
-From 2e13233a89859c10902059dd34160582af62ee1e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 24 Mar 2016 23:24:18 +0000
-Subject: [PATCH 89/93] [mmalrenderer] Wait for vsync before submitting to mmal
- when display sync is disabled
-
-This avoids an issue where video occasionally goes stuttery after a seek, until the next pause/play or seek.
-The issue is when display sync is disabled, and framerate of video matches display, and render times are coincident with vsync
-you find that depending on timestamp/scheduling jitter, you may or may not get an update each vsync resulting in stuttery video.
-
-Some scheme to force render times to be dependent on vsync is required. We do this by using a queue that is popped following vsyncs.
-We ensure the queue always has 1 or 2 frames so it doesn't underrun with a late frame, but this adds a frame of latency.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 66 ++++++++++++++++++++++++++++--
- xbmc/cores/VideoRenderers/MMALRenderer.h   |  6 ++-
- 2 files changed, 68 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index ad3f66f..76c4682 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -239,7 +239,7 @@ bool CMMALRenderer::init_vout(ERenderFormat format)
-   return true;
- }
- 
--CMMALRenderer::CMMALRenderer()
-+CMMALRenderer::CMMALRenderer() : CThread("MMALRenderer")
- {
-   CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   m_vout = NULL;
-@@ -253,15 +253,69 @@ CMMALRenderer::CMMALRenderer()
-   m_iYV12RenderBuffer = 0;
-   m_inflight = 0;
-   m_sharpness = -2.0f;
-+  m_queue = mmal_queue_create();
-+  Create();
- }
- 
- CMMALRenderer::~CMMALRenderer()
- {
-   CSingleLock lock(m_sharedSection);
-   CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-+  StopThread(true);
-+  mmal_queue_destroy(m_queue);
-   UnInit();
- }
- 
-+void CMMALRenderer::Process()
-+{
-+  SetPriority(THREAD_PRIORITY_ABOVE_NORMAL);
-+  while (!m_bStop)
-+  {
-+    g_RBP.WaitVsync();
-+    double dfps = g_graphicsContext.GetFPS();
-+    if (dfps <= 0.0)
-+      dfps = m_fps;
-+    // This algorithm is basically making the decision according to Bresenham's line algorithm.  Imagine drawing a line where x-axis is display frames, and y-axis is video frames
-+    m_error += m_fps / dfps;
-+    // we may need to discard frames if queue length gets too high or video frame rate is above display frame rate
-+    while (mmal_queue_length(m_queue) > 2 || m_error > 1.0)
-+    {
-+      if (m_error > 1.0)
-+        m_error -= 1.0;
-+      MMAL_BUFFER_HEADER_T *buffer = mmal_queue_get(m_queue);
-+      if (buffer)
-+      {
-+        if (m_format == RENDER_FMT_MMAL)
-+        {
-+          CMMALVideoBuffer *omvb = (CMMALVideoBuffer *)buffer->user_data;
-+          assert(buffer == omvb->mmal_buffer);
-+          m_inflight--;
-+          omvb->Release();
-+        }
-+        else if (m_format == RENDER_FMT_YUV420P)
-+        {
-+          CYUVVideoBuffer *omvb = (CYUVVideoBuffer *)buffer->user_data;
-+          assert(buffer == omvb->mmal_buffer);
-+          m_inflight--;
-+          omvb->Release();
-+        }
-+        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+          CLog::Log(LOGDEBUG, "%s::%s - discard buffer:%p vsync:%d queue:%d diff:%f", CLASSNAME, __func__, buffer, g_RBP.VsyncCount(), mmal_queue_length(m_queue), m_error);
-+      }
-+    }
-+    // this is case where we would like to display a new frame
-+    if (m_error > 0.0)
-+    {
-+      m_error -= 1.0;
-+      MMAL_BUFFER_HEADER_T *buffer = mmal_queue_get(m_queue);
-+      if (buffer)
-+        mmal_port_send_buffer(m_vout_input, buffer);
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - buffer:%p vsync:%d queue:%d diff:%f", CLASSNAME, __func__, buffer, g_RBP.VsyncCount(), mmal_queue_length(m_queue), m_error);
-+    }
-+  }
-+}
-+
- void CMMALRenderer::AddProcessor(CMMALVideoBuffer *buffer, int index)
- {
- #if defined(MMAL_DEBUG_VERBOSE)
-@@ -496,7 +550,10 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-         return;
-       omvb->Acquire();
-       omvb->mmal_buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER1 | MMAL_BUFFER_HEADER_FLAG_USER2;
--      mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-+      if (!CSettings::GetInstance().GetBool("videoplayer.usedisplayasclock"))
-+        mmal_queue_put(m_queue, omvb->mmal_buffer);
-+      else
-+        mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-     }
-     else
-       CLog::Log(LOGDEBUG, "%s::%s - No buffer to update", CLASSNAME, __func__);
-@@ -516,7 +573,10 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-       omvb->Acquire();
-       omvb->mmal_buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER1 | MMAL_BUFFER_HEADER_FLAG_USER2;
-       omvb->mmal_buffer->user_data = omvb;
--      mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-+      if (!CSettings::GetInstance().GetBool("videoplayer.usedisplayasclock"))
-+        mmal_queue_put(m_queue, omvb->mmal_buffer);
-+      else
-+        mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-     }
-     else
-       CLog::Log(LOGDEBUG, "%s::%s - No buffer to update: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.h b/xbmc/cores/VideoRenderers/MMALRenderer.h
-index a71e645..34cb294 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.h
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.h
-@@ -29,6 +29,7 @@
- #include "cores/dvdplayer/DVDStreamInfo.h"
- #include "guilib/Geometry.h"
- #include "BaseRenderer.h"
-+#include "threads/Thread.h"
- 
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/util/mmal_util.h>
-@@ -55,7 +56,7 @@ protected:
-   long m_refs;
- };
- 
--class CMMALRenderer : public CBaseRenderer
-+class CMMALRenderer : public CBaseRenderer, public CThread
- {
-   struct YUVBUFFER
-   {
-@@ -66,6 +67,7 @@ public:
-   CMMALRenderer();
-   ~CMMALRenderer();
- 
-+  void Process();
-   virtual void Update();
-   virtual void SetupScreenshot() {};
- 
-@@ -125,6 +127,8 @@ protected:
-   MMAL_COMPONENT_T *m_vout;
-   MMAL_PORT_T *m_vout_input;
-   MMAL_POOL_T *m_vout_input_pool;
-+  MMAL_QUEUE_T *m_queue;
-+  double m_error;
- 
-   bool init_vout(ERenderFormat format);
-   void ReleaseBuffers();
-
-From 1f63176ba9c91a1f1e58dec440a56e90ee944583 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 16:57:19 +0000
-Subject: [PATCH 90/93] mmal: Include mmal renderer logging in video category
-
-On Pi the decoder and renderer are closely coupled so combining the
-logging category makes sense to me.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 109 +++++++++++------------------
- 1 file changed, 42 insertions(+), 67 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 76c4682..e1099da 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -36,10 +36,6 @@
- 
- #define CLASSNAME "CMMALRenderer"
- 
--#ifdef _DEBUG
--#define MMAL_DEBUG_VERBOSE
--#endif
--
- 
- CYUVVideoBuffer::CYUVVideoBuffer()
- {
-@@ -56,9 +52,8 @@ CYUVVideoBuffer::~CYUVVideoBuffer()
- CYUVVideoBuffer *CYUVVideoBuffer::Acquire()
- {
-   long count = AtomicIncrement(&m_refs);
--#ifdef MMAL_DEBUG_VERBOSE
--  CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
-   (void)count;
-   return this;
- }
-@@ -66,9 +61,8 @@ CYUVVideoBuffer *CYUVVideoBuffer::Acquire()
- long CYUVVideoBuffer::Release()
- {
-   long count = AtomicDecrement(&m_refs);
--#ifdef MMAL_DEBUG_VERBOSE
--  CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
-   if (count == 0)
-   {
-     mmal_buffer_header_release(mmal_buffer);
-@@ -88,9 +82,8 @@ CRenderInfo CMMALRenderer::GetRenderInfo()
-   if (!m_bMMALConfigured)
-     m_bMMALConfigured = init_vout(RENDER_FMT_MMAL);
- 
--  #if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s cookie:%p", CLASSNAME, __func__, (void *)m_vout_input_pool);
--  #endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s cookie:%p", CLASSNAME, __func__, (void *)m_vout_input_pool);
- 
-   info.max_buffer_size = NUM_BUFFERS;
-   info.optimal_buffer_size = NUM_BUFFERS;
-@@ -112,18 +105,16 @@ void CMMALRenderer::vout_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *
-   {
-     CMMALVideoBuffer *omvb = (CMMALVideoBuffer *)buffer->user_data;
-     assert(buffer == omvb->mmal_buffer);
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
-     omvb->Release();
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-     CYUVVideoBuffer *omvb = (CYUVVideoBuffer *)buffer->user_data;
-     assert(buffer == omvb->mmal_buffer);
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
-     m_inflight--;
-     omvb->Release();
-   }
-@@ -318,9 +309,8 @@ void CMMALRenderer::Process()
- 
- void CMMALRenderer::AddProcessor(CMMALVideoBuffer *buffer, int index)
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s - %p (%p) %i", CLASSNAME, __func__, buffer, buffer->mmal_buffer, index);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s - %p (%p) %i", CLASSNAME, __func__, buffer, buffer->mmal_buffer, index);
- 
-   YUVBUFFER &buf = m_buffers[index];
-   assert(!buf.MMALBuffer);
-@@ -372,17 +362,15 @@ int CMMALRenderer::GetImage(YV12Image *image, int source, bool readonly)
- {
-   if (!image || source < 0)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - invalid: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - invalid: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
-     return -1;
-   }
- 
-   if (m_format == RENDER_FMT_MMAL)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - MMAL: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - MMAL: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-@@ -425,9 +413,8 @@ int CMMALRenderer::GetImage(YV12Image *image, int source, bool readonly)
-     if (!buf.YUVBuffer)
-       return -1;
-     buf.YUVBuffer->mmal_buffer = buffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - YUV: image:%p source:%d ro:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, image, source, readonly, buf.YUVBuffer, buffer, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - YUV: image:%p source:%d ro:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, image, source, readonly, buf.YUVBuffer, buffer, m_inflight);
-     buf.YUVBuffer->Acquire();
-   }
-   else assert(0);
-@@ -440,16 +427,14 @@ void CMMALRenderer::ReleaseBuffer(int idx)
-   CSingleLock lock(m_sharedSection);
-   if (!m_bMMALConfigured)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, idx);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, idx);
-     return;
-   }
-   if (m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - bypass: source:%d", CLASSNAME, __func__, idx);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - bypass: source:%d", CLASSNAME, __func__, idx);
-     return;
-   }
- 
-@@ -457,17 +442,15 @@ void CMMALRenderer::ReleaseBuffer(int idx)
-   if (m_format == RENDER_FMT_MMAL)
-   {
-     CMMALVideoBuffer *omvb = buffer->MMALBuffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - MMAL: source:%d omvb:%p mmal:%p", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - MMAL: source:%d omvb:%p mmal:%p", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL);
-     SAFE_RELEASE(buffer->MMALBuffer);
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-     CYUVVideoBuffer *omvb = buffer->YUVBuffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - YUV: source:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - YUV: source:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL, m_inflight);
-     if (omvb && omvb->mmal_buffer)
-       SAFE_RELEASE(buffer->YUVBuffer);
-   }
-@@ -491,9 +474,8 @@ void CMMALRenderer::Flush()
- 
- void CMMALRenderer::Update()
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   if (!m_bConfigured) return;
-   ManageDisplay();
- }
-@@ -505,9 +487,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
- 
-   if (!m_bConfigured)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-     return;
-   }
- 
-@@ -529,9 +510,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
- 
-   if (m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - bypass: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - bypass: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-     return;
-   }
-   SetVideoRect(m_sourceRect, m_destRect);
-@@ -542,9 +522,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     CMMALVideoBuffer *omvb = buffer->MMALBuffer;
-     if (omvb && omvb->mmal_buffer)
-     {
--#if defined(MMAL_DEBUG_VERBOSE)
--      CLog::Log(LOGDEBUG, "%s::%s - MMAL: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
--#endif
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - MMAL: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
-       // we only want to upload frames once
-       if (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER1)
-         return;
-@@ -563,9 +542,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     CYUVVideoBuffer *omvb = buffer->YUVBuffer;
-     if (omvb && omvb->mmal_buffer)
-     {
--#if defined(MMAL_DEBUG_VERBOSE)
--      CLog::Log(LOGDEBUG, "%s::%s - YUV: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
--#endif
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - YUV: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
-       // we only want to upload frames once
-       if (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER1)
-         return;
-@@ -589,15 +567,13 @@ void CMMALRenderer::FlipPage(int source)
-   CSingleLock lock(m_sharedSection);
-   if (!m_bConfigured || m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, source);
-     return;
-   }
- 
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s - source:%d", CLASSNAME, __func__, source);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s - source:%d", CLASSNAME, __func__, source);
- 
-   m_iYV12RenderBuffer = source;
- }
-@@ -630,9 +606,8 @@ unsigned int CMMALRenderer::PreInit()
- 
- void CMMALRenderer::ReleaseBuffers()
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   for (int i=0; i<NUM_BUFFERS; i++)
-     ReleaseBuffer(i);
- }
-
-From 6a008777d9f6c0b2b8724fc66cecf74ac7c32383 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 17:34:48 +0000
-Subject: [PATCH 91/93] rendermanager: Increase configure timeout to see if it
- fixes video playing in background issues
-
----
- xbmc/cores/VideoRenderers/RenderManager.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp
-index 7a99ac4..4b03c86 100644
---- a/xbmc/cores/VideoRenderers/RenderManager.cpp
-+++ b/xbmc/cores/VideoRenderers/RenderManager.cpp
-@@ -244,7 +244,7 @@ bool CXBMCRenderManager::Configure(unsigned int width, unsigned int height, unsi
-   CSingleLock    lock2(m_presentlock);
- 
-   /* make sure any queued frame was fully presented */
--  XbmcThreads::EndTime endtime(5000);
-+  XbmcThreads::EndTime endtime(10000);
-   while(m_presentstep != PRESENT_IDLE && m_presentstep != PRESENT_READY)
-   {
-     if(endtime.IsTimePast())
-
-From 67223b6440475c4797aa2aa1949f73c078114474 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 17:39:47 +0000
-Subject: [PATCH 92/93] Revert "[rbp] Use default resampling setting on Pi2"
-
-This reverts commit e6b2f1693480ad5d8062acaed512393e72fb9b1d.
----
- system/settings/rbp2.xml | 5 -----
- 1 file changed, 5 deletions(-)
-
-diff --git a/system/settings/rbp2.xml b/system/settings/rbp2.xml
-index 52778ec..8cc8f19 100644
---- a/system/settings/rbp2.xml
-+++ b/system/settings/rbp2.xml
-@@ -23,11 +23,6 @@
-         <setting id="audiooutput.ac3transcode" help="36429">
-         </setting>
-       </group>
--      <group id="1">
--        <setting id="audiooutput.processquality">
--          <default>30</default> <!-- AE_QUALITY_MID -->
--        </setting>
--      </group>
-     </category>
-   </section>
- </settings>
-
-From bb33be4220a3fd1ad131ec1f2218f7b4750fda98 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 31 Mar 2016 20:00:15 +0100
-Subject: [PATCH 93/93] Revert "[rbp] Make sync playback to display the default
- option"
-
-This reverts commit 492a2e7ac5fb1895b71b62f68918e74db053f0b9.
----
- system/settings/rbp.xml | 7 -------
- 1 file changed, 7 deletions(-)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index f2a6892..1506035 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -1,13 +1,6 @@
- <?xml version="1.0" encoding="utf-8" ?>
- <settings>
-   <section id="videos">
--    <category id="videoplayer">
--      <group id="3">
--        <setting id="videoplayer.usedisplayasclock">
--          <default>true</default>
--        </setting>
--      </group>
--    </category>
-     <category id="videoacceleration">
-       <group id="1">
-         <visible>false</visible>
diff --git a/projects/RPi2/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch b/projects/RPi2/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch
deleted file mode 100644
index 84ea8d1914..0000000000
--- a/projects/RPi2/patches/kodi-theme-Confluence/kodi-theme-Confluence-001-jarvis-rbp-backports.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 01759c5adfb050b1ba0c8a8fc4e20a875a98c0e5 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 10 Aug 2014 18:58:37 +0100
-Subject: [PATCH 1/2] confluence: Remove media-overlay.jpg for when video is
- backgrounded
-
----
- 720p/IncludesBackgroundBuilding.xml | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/720p/IncludesBackgroundBuilding.xml b/720p/IncludesBackgroundBuilding.xml
-index cc996da..d6aa595 100644
---- a/720p/IncludesBackgroundBuilding.xml
-+++ b/720p/IncludesBackgroundBuilding.xml
-@@ -32,7 +32,7 @@
- 		</control>
- 		<control type="image">
- 			<include>BackgroundDimensions</include>
--			<texture>special://skin/backgrounds/media-overlay.jpg</texture>
-+			<!--texture>special://skin/backgrounds/media-overlay.jpg</texture-->
- 			<visible>[Player.HasVideo + !Skin.HasSetting(ShowBackgroundVideo)] + !Window.IsVisible(TVChannels) + !Window.IsVisible(RadioChannels)</visible>
- 			<include>VisibleFadeEffect</include>
- 		</control>
--- 
-2.5.0
-
-
-From d99f70c094006144f07bdf739f5847b733030245 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jul 2013 23:19:15 +0100
-Subject: [PATCH 2/2] [confluence] Use animated gif as a cheaper working
- spinner
-
----
- 720p/DialogBusy.xml |   4 ++--
- media/busy.gif      | Bin 0 -> 3199 bytes
- 2 files changed, 2 insertions(+), 2 deletions(-)
- create mode 100644 media/busy.gif
-
-diff --git a/720p/DialogBusy.xml b/720p/DialogBusy.xml
-index b5df91f..8e84fed 100644
---- a/720p/DialogBusy.xml
-+++ b/720p/DialogBusy.xml
-@@ -25,9 +25,9 @@
- 				<top>20</top>
- 				<width>32</width>
- 				<height>32</height>
--				<texture>busy.png</texture>
-+				<texture>busy.gif</texture>
- 				<aspectratio>keep</aspectratio>
--				<animation effect="rotate" start="0" end="360" center="36,36" time="900" loop="true" condition="true">conditional</animation>
-+				<!--animation effect="rotate" start="0" end="360" center="36,36" time="900" loop="true" condition="true">conditional</animation-->
- 			</control>
- 			<control type="label">
- 				<description>Busy label</description>
-diff --git a/media/busy.gif b/media/busy.gif
-new file mode 100644
-index 0000000000000000000000000000000000000000..f856ed0b44fdc7e9b0520c7e39a9bebc04136897
-GIT binary patch
-literal 3199
-zcmc(fX;f2Z9>y=ZH#f_@I(Toefj|vNK&XX)Kv`7?5Fj9nfFK|eqk^a)i$D+vOGrT0
-z5Y~W_Kw^kiL|j0tWs!ndI9M0djt<b)1^0UFSf^(ku;<M510Uv`Ij8e}c;7GYd*1VV
-z{?GGp{e7KX5|{uJcmpi14<9}_J39-7LO(x0fj}@eHm22TEi5dIjEuazyk=%*-oJnU
-z@#5lU`YS}SUp)V5#7Wzkkg-$1%u32gWG3uM$ljZhAh1c=zbgeW{eW!p3E#}|4DqqI
-zVS{=L57sWeUi=h*9_f$yCnc!oCm!*6V)Hg%)Sx4b;3*G3y_Tf5-#ll{cC@v<x=)pA
-zDy$p)mTgGTcQYShbM#tm2>#Zlzxi?2cy3&gq6Zb5u5$@@ybycc*-E=i(!PE0#hKFr
-z8YwHA>k$o6chjKaEI)p!$wAoDL&l~=#<s>v-6#Yi*r{$k>ZaBy{Iun&<0smBO`6!}
-z&StP`{nE=0r$xK_71$KwBW33sD3@jYuko!${Lx^a-L6nm0XI6%X29c~=ih=uThmJ)
-z0`sx_;Y1skXCCRh-xeI%QSoNyoC$Wv8mqotx(oQi`#)m+fqj0FJ-Ty#;#jn=BdyK1
-z=B6qP9+rpm>D&te;)?^%P6X=O1hYo3EX2|KvKf!7<in@|#EeC0w2e)Mm}q{`poK|>
-zPC<6`-z>t4dt|sDMs?7$3qAAwp-9+}MG!&<X;8uHJ!}cJinh0cyE~Vi50<$Xx>HJG
-zR56mo+HAFw+XYc`x>Q}qx+NXw8e)#H8!s@v;-0BxXf+|F!d7fF@#<yem8N9j5SC=r
-zc$J(})qZ_E@sRLVqKtdLWG-?<)m``mc;5dsHW7am(_Qrv810mnzqP3UthRm2kuT0W
-z&lrxG68`O#Kj6Puz_+qnz<*@Nbri&j*VHS+Mo-Jb?JMMa0!wm-Z@4q@m?_7ZAll^_
-zDtpS%N^D4F2KKG&^e2?Q+9#gy(T;AXoq>p{m^y-r6hhl}(OBf7Z9T-XA592wChCS0
-zghE0dyCjN1p_XlCgFCZKa-d{)QY(vcG`ORqtd3Hv5q9cwWnZ4q9qQ6`p6gJt`i)yI
-z(k}USCso)D$=T8}kwkOttLQTdyk*$=N_P3zvI>gu=+%v7SlP7u{o}wK&@wvs;D^UR
-z_hrU2=hu&--McI1v8!sb^Bc*rSV@S1`PqwVQ`z9M6-zo`(fwU?)%?V6L!Su!wg%>{
-z_Ap6;JlsqFZs&4*_Hyca%X9UeJIQa&rOjqikQOUh?PJH$+b_COMJv9VKYa@bFkHLx
-zuWLOM>d88zwur244toBqkoynF%QKPCmFw0Ka@mD&l3lo%OWd7hoRh}_r7X*^);1h%
-zS0Y48mAJw~sY5l#+K!_g>SODUoZ~jI`sJw?F0nP#X1q5bEJf^ykCn3huW0t=Buk;;
-z><OG4E{)F{TT=~)r?<_`<?!07($8YgTA!>>ir2ELZoSz5-s3JdcW?Z_9fzG0p9c+f
-zF75u0<G0q%Pgq*{s0DHLH8@=}91=$HB+Fe*zx;<VO;%Lb%Fn7>#&QywaXZ%t%wx<X
-zvUdSJpAPsYepv&92^b?33aQ+l$}%1*we|81iQN&gY;M_x@Tm;K7TZ*q#I*>{@Fph2
-z8>d4#CVRatgPNLgq_yn;r95LtQ=1x5s+u(jzg=bD(&G)*)dYbk>S|JZ)~LT9sTL@u
-zECM`$!UZY^R=O23aIrSC7%KKzN9#j7fanO3;b9=-Uc<wOwcmJ`n?DRl3VQB76FuWr
-ze&-Y}Y7)5DF@p(38BssGAF%vvh1c?>$Ny%r0K&Q!3cxIY#cXS+p(!;!p<8t|C$aCa
-zmw^IV`QJZ}KB=L;O)P3iejKViBr3riXEU+Os6+HSe_FAg!SQwiv}qZC3nHl$W!6d@
-z71S4lX+^(KPOx(Lj8E2YV-Zqp(+-;Kb+HVT)!}eMBgt)5sl=u6D6D9$YloY~9dexu
-zC}LF_6A=VFoaEBm3%6lfCWlPwM@VL+UM3*w1;PDiz0>h(8VN;V8aTNv3fh%t9zvN$
-za|0i22m!`XM6L*S0Fdda0iPRhpr2%l$fxG?zK7#0^aKDQU~07$CjLum<cgsI^Cg?$
-z=^u0uFIa(TpbQ(@x~a@M#9Ht8tlKT~kX&o|rlNu8u|88ZKbw*Z>>`5g?|W04<I|k{
-zf8kS=oj={Y;@8;XOz~=TqntirW=YnyZvFmV#w8iwSj1b+vE)h1aIm71n3cCdS}6x>
-zWxRT@I!WkM-tG!!7x5q%CKh;dv|7?pi#(SC(n*+<A&fO4BC*gpt<Kw#?v@QANIy>0
-z_JkG+)}Xfm$~B~iw*|Y6V&e#09^^HH!`DEm|CuKkbWa-O@fsst>tv+k@8<e0IeRDQ
-z6%&;KwCx)MxzJKg`0~%-=L+cyz|v~xJYkuvQl4L~U`3Vi2q5TUQtYbI7P9j?Srnoc
-zBrcLeuot<fb1!`d3L1ehObS%-LknRP9YwAKlo7Z_N}NOuuOk?7WV8&K90Km+q>)}8
-zIQam3jP;F3e?*@>`tHd{w(%sqW=s(>)3Q9u09{KSdvd$zkgeI0Rru=vq=DF#Oe*oT
-zUC2Jj$)htMM58IqjK$$@6z*jZz{EVhT7wY1hm0T)9FAM%LDXn*5`t*q1rLmYg^^Y_
-z5X0LrM!i=K-<X=iiDNDt@Wv0=Q}2ho$4CwE7JL2`Pa{@kR#MJmqt?c3cFpEb!SRfx
-zJMm8m!h#Wq2wGEHCUm6MHllU%%Ic=_qa3j-1zZ#YL5LrPfOWb>AxOOl={&<{L5D6K
-z>;#ZUjACfyRUVK|-3P;Ja2nW>ZDDm2Hk|&>-$hEgc><W-n8G36B;oVn8+Z@Cd2$CA
-ze9w4E`#J92pw}150idYeiA<k5?!9CX9{*1!Afw-i3_cctKAXI5B~jNaPrTKK7b_A5
-z8uu_GFd~x6@lQ71$LFn<?BLbPFp)E*BsW1;@7%m!QBQAbu2V~jB5PcwwnP{{;2KoO
-zppv?~Ihmq-;Cvsc%*ZFPH~C~Y$qAr<mw-hT;=6l_LC`70-8O=J7mW7vmCd?rz=DQY
-z)LHJeZ=Y~xlivUbwT!#3*W1LsccOK@GL={@|AY`J{cjQwf&SitlwrC&U|`R?P4?+L
-zr)p}C0u>zg*B6Z$SRsZ~{lGhZr?x1$s1t=S{s}LNuEC0*wfD$uoK%8ST1Gwr7S<)+
-z9iovkGf8Q-PRuArxY*w<zSKAaAqKDZq2vZD6jGN+60GWIJ>t@#b!a<6hm>8dJ&H!R
-z$Oeur6$C~7PILzO;&4;G^(D-5m{@KUl(MQ1yKs>x+CMqUUllb3x?=fppG8_v>@)ZY
-xGUUb21Yb|%0k3;EZL|UA71ss70{N2%rZhjU!V8TtZxzYw_F;tVxR3z1e*p+ht@8i?
-
-literal 0
-HcmV?d00001
-
--- 
-2.5.0
-
diff --git a/projects/RPi2/patches/kodi/kodi-001-jarvis-rbp-backports.patch b/projects/RPi2/patches/kodi/kodi-001-jarvis-rbp-backports.patch
deleted file mode 100644
index fa712e8b6a..0000000000
--- a/projects/RPi2/patches/kodi/kodi-001-jarvis-rbp-backports.patch
+++ /dev/null
@@ -1,52162 +0,0 @@
-From d11fabefb909e75e7186bd9ecd0cbff9e8b24577 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Sep 2015 19:11:14 +0100
-Subject: [PATCH 01/93] Enable concealed error frames, but discard them when
- returned
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 17 ++++++++---------
- 1 file changed, 8 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index bebe136..727a9ea 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -255,11 +255,14 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-     if (buffer->length > 0)
-     {
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-+      CMMALVideoBuffer *omvb = NULL;
-+      if (!g_advancedSettings.m_omxDecodeStartWithValidFrame || !(buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+        omvb = new CMMALVideoBuffer(this);
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
-+          CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-+      if (omvb)
-       {
--        CMMALVideoBuffer *omvb = new CMMALVideoBuffer(this);
--        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--          CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
--            CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-         omvb->mmal_buffer = buffer;
-         buffer->user_data = (void *)omvb;
-         omvb->width = m_decoded_width;
-@@ -521,7 +524,6 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   m_hints = hints;
-   m_vout_input_pool = (MMAL_POOL_T *)options.m_opaque_pointer;
-   MMAL_STATUS_T status;
--  MMAL_PARAMETER_BOOLEAN_T error_concealment;
- 
-   m_decoded_width  = hints.width;
-   m_decoded_height = hints.height;
-@@ -630,10 +632,7 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   }
-   m_dec_input->format->flags |= MMAL_ES_FORMAT_FLAG_FRAMED;
- 
--  error_concealment.hdr.id = MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT;
--  error_concealment.hdr.size = sizeof(MMAL_PARAMETER_BOOLEAN_T);
--  error_concealment.enable = g_advancedSettings.m_omxDecodeStartWithValidFrame;
--  status = mmal_port_parameter_set(m_dec_input, &error_concealment.hdr);
-+  status = mmal_port_parameter_set_boolean(m_dec_input, MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT, MMAL_FALSE);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to disable error concealment on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-
-From 5fb2a476f902f028de46e46863fdc74b4c021371 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 26 Aug 2015 21:47:41 +0100
-Subject: [PATCH 02/93] Reduce framerate of high framerate videos when not
- running fullscreen
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 14 +++++++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 15 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 727a9ea..8211e94 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -126,6 +126,8 @@ CMMALVideo::CMMALVideo()
-   m_es_format = mmal_format_alloc();
-   m_preroll = true;
-   m_speed = DVD_PLAYSPEED_NORMAL;
-+  m_fps = 0.0f;
-+  m_num_decoded = 0;
- }
- 
- CMMALVideo::~CMMALVideo()
-@@ -256,8 +258,15 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-     {
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-       CMMALVideoBuffer *omvb = NULL;
--      if (!g_advancedSettings.m_omxDecodeStartWithValidFrame || !(buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+      bool wanted = true;
-+      // we don't keep up when running at 60fps in the background so switch to half rate
-+      if (m_fps > 40.0f && !g_graphicsContext.IsFullScreenVideo() && !(m_num_decoded & 1))
-+        wanted = false;
-+      if (g_advancedSettings.m_omxDecodeStartWithValidFrame && (buffer->flags & MMAL_BUFFER_HEADER_FLAG_CORRUPTED))
-+        wanted = false;
-+      if (wanted)
-         omvb = new CMMALVideoBuffer(this);
-+      m_num_decoded++;
-       if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-         CLog::Log(LOGDEBUG, "%s::%s - %p (%p) buffer_size(%u) dts:%.3f pts:%.3f flags:%x:%x",
-           CLASSNAME, __func__, buffer, omvb, buffer->length, buffer->dts*1e-6, buffer->pts*1e-6, buffer->flags, buffer->type->video.flags);
-@@ -629,7 +638,10 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   {
-     m_dec_input->format->es->video.frame_rate.num = hints.fpsrate;
-     m_dec_input->format->es->video.frame_rate.den = hints.fpsscale;
-+    m_fps = hints.fpsrate / hints.fpsscale;
-   }
-+  else
-+    m_fps = 0.0f;
-   m_dec_input->format->flags |= MMAL_ES_FORMAT_FLAG_FRAMED;
- 
-   status = mmal_port_parameter_set_boolean(m_dec_input, MMAL_PARAMETER_VIDEO_DECODE_ERROR_CONCEALMENT, MMAL_FALSE);
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index d081b9c..0ea6ecd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -123,6 +123,8 @@ protected:
-   bool SendCodecConfigData();
- 
-   CDVDStreamInfo    m_hints;
-+  float             m_fps;
-+  unsigned          m_num_decoded;
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
-
-From 8f815de22d00759496cd60139fb497d4064002cf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 20:08:05 +0000
-Subject: [PATCH 03/93] Remove preroll
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 12 ++----------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  1 -
- 2 files changed, 2 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 8211e94..8468db9 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -124,7 +124,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_demux_queue_length = 0;
-   m_es_format = mmal_format_alloc();
--  m_preroll = true;
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-   m_num_decoded = 0;
-@@ -718,7 +717,6 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-     return false;
- 
-   Prime();
--  m_preroll = !m_hints.stills;
-   m_speed = DVD_PLAYSPEED_NORMAL;
- 
-   return true;
-@@ -874,13 +872,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
-   if (mmal_queue_length(m_dec_input_pool->queue) > 0 && !m_demux_queue_length && queued <= DVD_MSEC_TO_TIME(1000))
-     ret |= VC_BUFFER;
--  else
--    m_preroll = false;
--
--  if (m_preroll && m_output_ready.size() >= GetAllowedReferences())
--    m_preroll = false;
- 
--  if (!m_output_ready.empty() && !m_preroll)
-+  if (!m_output_ready.empty())
-   {
-     ret |= VC_PICTURE;
-   }
-@@ -888,7 +881,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     Sleep(10); // otherwise we busy spin
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f) preroll(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6, m_preroll);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6);
- 
-   return ret;
- }
-@@ -958,7 +951,6 @@ void CMMALVideo::Reset(void)
-   }
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
--  m_preroll = !m_hints.stills && (m_speed == DVD_PLAYSPEED_NORMAL || m_speed == DVD_PLAYSPEED_PAUSE);
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 0ea6ecd..50ac0e3 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -131,7 +131,6 @@ protected:
-   double            m_demuxerPts;
-   double            m_decoderPts;
-   int               m_speed;
--  bool              m_preroll;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From 18c08e9c0410f43d2deec9d69e64eca7fdfd9a17 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 22:35:11 +0000
-Subject: [PATCH 04/93] Remove demux queue
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 81 ++++------------------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   | 10 ---
- 2 files changed, 12 insertions(+), 79 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 8468db9..61ae7e7 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -122,7 +122,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_codingType = 0;
- 
--  m_demux_queue_length = 0;
-   m_es_format = mmal_format_alloc();
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-@@ -742,55 +741,13 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   //  CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d dts:%.3f pts:%.3f ready_queue(%d)",
-   //    CLASSNAME, __func__, pData, iSize, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, m_output_ready.size());
- 
--  unsigned int demuxer_bytes = 0;
--  uint8_t *demuxer_content = NULL;
-   MMAL_BUFFER_HEADER_T *buffer;
-   MMAL_STATUS_T status;
- 
-   Prime();
--  // we need to queue then de-queue the demux packet, seems silly but
--  // mmal might not have an input buffer available when we are called
--  // and we must store the demuxer packet and try again later.
--  // try to send any/all demux packets to mmal decoder.
--  unsigned space = mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size;
--  if (pData && m_demux_queue.empty() && space >= (unsigned int)iSize)
--  {
--    demuxer_bytes = iSize;
--    demuxer_content = pData;
--  }
--  else if (pData && iSize)
--  {
--    mmal_demux_packet demux_packet;
--    demux_packet.dts = dts;
--    demux_packet.pts = pts;
--    demux_packet.size = iSize;
--    demux_packet.buff = new uint8_t[iSize];
--    memcpy(demux_packet.buff, pData, iSize);
--    m_demux_queue_length += demux_packet.size;
--    m_demux_queue.push(demux_packet);
--  }
--
--  uint8_t *buffer_to_free = NULL;
--
-   while (1)
-   {
--     space = mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size;
--     if (!demuxer_bytes && !m_demux_queue.empty())
--     {
--       mmal_demux_packet &demux_packet = m_demux_queue.front();
--       if (space >= (unsigned int)demux_packet.size)
--       {
--         // need to lock here to retrieve an input buffer and pop the queue
--         m_demux_queue_length -= demux_packet.size;
--         m_demux_queue.pop();
--         demuxer_bytes = (unsigned int)demux_packet.size;
--         demuxer_content = demux_packet.buff;
--         buffer_to_free = demux_packet.buff;
--         dts = demux_packet.dts;
--         pts = demux_packet.pts;
--       }
--     }
--     if (demuxer_content)
-+     if (pData)
-      {
-        // 500ms timeout
-        buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-@@ -805,20 +762,20 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-        buffer->pts = pts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : pts;
-        buffer->dts = dts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : dts;
-        if (m_hints.ptsinvalid) buffer->pts = MMAL_TIME_UNKNOWN;
--       buffer->length = demuxer_bytes > buffer->alloc_size ? buffer->alloc_size : demuxer_bytes;
-+       buffer->length = (uint32_t)iSize > buffer->alloc_size ? buffer->alloc_size : (uint32_t)iSize;
-        // set a flag so we can identify primary frames from generated frames (deinterlace)
-        buffer->flags = MMAL_BUFFER_HEADER_FLAG_USER0;
- 
--       memcpy(buffer->data, demuxer_content, buffer->length);
--       demuxer_bytes   -= buffer->length;
--       demuxer_content += buffer->length;
-+       memcpy(buffer->data, pData, buffer->length);
-+       iSize -= buffer->length;
-+       pData += buffer->length;
- 
--       if (demuxer_bytes == 0)
-+       if (iSize == 0)
-          buffer->flags |= MMAL_BUFFER_HEADER_FLAG_FRAME_END;
- 
-        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--         CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d/%-6d dts:%.3f pts:%.3f flags:%x ready_queue(%d) demux_queue(%d) space(%d)",
--            CLASSNAME, __func__, buffer, buffer->length, demuxer_bytes, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, buffer->flags, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size);
-+         CLog::Log(LOGDEBUG, "%s::%s - %-8p %-6d/%-6d dts:%.3f pts:%.3f flags:%x ready_queue(%d)",
-+            CLASSNAME, __func__, buffer, buffer->length, iSize, dts == DVD_NOPTS_VALUE ? 0.0 : dts*1e-6, pts == DVD_NOPTS_VALUE ? 0.0 : pts*1e-6, buffer->flags, m_output_ready.size());
-        assert((int)buffer->length > 0);
-        status = mmal_port_send_buffer(m_dec_input, buffer);
-        if (status != MMAL_SUCCESS)
-@@ -827,7 +784,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-          return VC_ERROR;
-        }
- 
--       if (demuxer_bytes == 0)
-+       if (iSize == 0)
-        {
-          EDEINTERLACEMODE deinterlace_request = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode;
-          EINTERLACEMETHOD interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-@@ -851,17 +808,9 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-            DestroyDeinterlace();
-          if (deinterlace && !m_deint)
-            CreateDeinterlace(interlace_method);
--
--         if (buffer_to_free)
--         {
--           delete [] buffer_to_free;
--           buffer_to_free = NULL;
--           demuxer_content = NULL;
--           continue;
--         }
-        }
-     }
--    if (!demuxer_bytes)
-+    if (!iSize)
-       break;
-   }
-   int ret = 0;
-@@ -870,7 +819,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   else if (dts != DVD_NOPTS_VALUE)
-     m_demuxerPts = dts;
-   double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
--  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && !m_demux_queue_length && queued <= DVD_MSEC_TO_TIME(1000))
-+  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && queued <= DVD_MSEC_TO_TIME(1000))
-     ret |= VC_BUFFER;
- 
-   if (!m_output_ready.empty())
-@@ -881,7 +830,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     Sleep(10); // otherwise we busy spin
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) demux_queue(%d) space(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), m_demux_queue_length, mmal_queue_length(m_dec_input_pool->queue) * m_dec_input->buffer_size, queued*1e-6);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6);
- 
-   return ret;
- }
-@@ -938,12 +887,6 @@ void CMMALVideo::Reset(void)
-       break;
-   }
- 
--  pthread_mutex_lock(&m_output_mutex);
--  while (!m_demux_queue.empty())
--    m_demux_queue.pop();
--  m_demux_queue_length = 0;
--  pthread_mutex_unlock(&m_output_mutex);
--
-   if (!m_finished)
-   {
-     SendCodecConfigData();
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 50ac0e3..f4df09c 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -64,13 +64,6 @@ private:
- 
- class CMMALVideo : public CDVDVideoCodec
- {
--  typedef struct mmal_demux_packet {
--    uint8_t *buff;
--    int size;
--    double dts;
--    double pts;
--  } mmal_demux_packet;
--
- public:
-   CMMALVideo();
-   virtual ~CMMALVideo();
-@@ -110,9 +103,6 @@ protected:
-   float             m_aspect_ratio;
-   const char        *m_pFormatName;
- 
--  std::queue<mmal_demux_packet> m_demux_queue;
--  unsigned           m_demux_queue_length;
--
-   // mmal output buffers (video frames)
-   pthread_mutex_t   m_output_mutex;
-   std::queue<CMMALVideoBuffer*> m_output_ready;
-
-From 432994f3a9e9867d04d4c3d360476d72acea0a6c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 2 Dec 2015 20:10:33 +0000
-Subject: [PATCH 05/93] Remove time based limit on submitted packets
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 24 +++-------------------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 --
- 2 files changed, 3 insertions(+), 23 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 61ae7e7..1674fdd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -108,8 +108,6 @@ CMMALVideo::CMMALVideo()
- 
-   m_interlace_mode = MMAL_InterlaceProgressive;
-   m_interlace_method = VS_INTERLACEMETHOD_NONE;
--  m_decoderPts = DVD_NOPTS_VALUE;
--  m_demuxerPts = DVD_NOPTS_VALUE;
- 
-   m_dec = NULL;
-   m_dec_input = NULL;
-@@ -814,23 +812,14 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-       break;
-   }
-   int ret = 0;
--  if (pts != DVD_NOPTS_VALUE)
--    m_demuxerPts = pts;
--  else if (dts != DVD_NOPTS_VALUE)
--    m_demuxerPts = dts;
--  double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
--  if (mmal_queue_length(m_dec_input_pool->queue) > 0 && queued <= DVD_MSEC_TO_TIME(1000))
--    ret |= VC_BUFFER;
- 
-   if (!m_output_ready.empty())
--  {
-     ret |= VC_PICTURE;
--  }
--  if (!ret)
--    Sleep(10); // otherwise we busy spin
-+  else
-+    ret |= VC_BUFFER;
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d)", CLASSNAME, __func__, ret, m_output_ready.size());
- 
-   return ret;
- }
-@@ -892,8 +881,6 @@ void CMMALVideo::Reset(void)
-     SendCodecConfigData();
-     Prime();
-   }
--  m_decoderPts = DVD_NOPTS_VALUE;
--  m_demuxerPts = DVD_NOPTS_VALUE;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -982,11 +969,6 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
-     return false;
-   }
- 
--  if (pDvdVideoPicture->pts != DVD_NOPTS_VALUE)
--    m_decoderPts = pDvdVideoPicture->pts;
--  else
--    m_decoderPts = pDvdVideoPicture->dts;
--
-   return true;
- }
- 
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index f4df09c..8f84557 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -118,8 +118,6 @@ protected:
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
--  double            m_demuxerPts;
--  double            m_decoderPts;
-   int               m_speed;
- 
-   CCriticalSection m_sharedSection;
-
-From 14ec8859335b4dc5add80bed34ce21ab3a4c8df4 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Dec 2015 22:18:47 +0000
-Subject: [PATCH 06/93] Add back logging of data queued in decoder
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 22 +++++++++++++++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 23 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 1674fdd..35a9847 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -108,6 +108,8 @@ CMMALVideo::CMMALVideo()
- 
-   m_interlace_mode = MMAL_InterlaceProgressive;
-   m_interlace_method = VS_INTERLACEMETHOD_NONE;
-+  m_decoderPts = DVD_NOPTS_VALUE;
-+  m_demuxerPts = DVD_NOPTS_VALUE;
- 
-   m_dec = NULL;
-   m_dec_input = NULL;
-@@ -252,6 +254,11 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-   {
-     if (buffer->length > 0)
-     {
-+      if (buffer->pts != MMAL_TIME_UNKNOWN)
-+        m_decoderPts = buffer->pts;
-+      else if (buffer->dts != MMAL_TIME_UNKNOWN)
-+        m_decoderPts = buffer->dts;
-+
-       assert(!(buffer->flags & MMAL_BUFFER_HEADER_FLAG_DECODEONLY));
-       CMMALVideoBuffer *omvb = NULL;
-       bool wanted = true;
-@@ -811,6 +818,17 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     if (!iSize)
-       break;
-   }
-+  if (pts != DVD_NOPTS_VALUE)
-+    m_demuxerPts = pts;
-+  else if (dts != DVD_NOPTS_VALUE)
-+    m_demuxerPts = dts;
-+
-+  if (m_demuxerPts != DVD_NOPTS_VALUE && m_decoderPts == DVD_NOPTS_VALUE)
-+    m_decoderPts = m_demuxerPts;
-+
-+  // we've built up quite a lot of data in decoder - try to throttle it
-+  double queued = m_decoderPts != DVD_NOPTS_VALUE && m_demuxerPts != DVD_NOPTS_VALUE ? m_demuxerPts - m_decoderPts : 0.0;
-+  bool full = queued > DVD_MSEC_TO_TIME(1000);
-   int ret = 0;
- 
-   if (!m_output_ready.empty())
-@@ -819,7 +837,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     ret |= VC_BUFFER;
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d)", CLASSNAME, __func__, ret, m_output_ready.size());
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
- 
-   return ret;
- }
-@@ -881,6 +899,8 @@ void CMMALVideo::Reset(void)
-     SendCodecConfigData();
-     Prime();
-   }
-+  m_decoderPts = DVD_NOPTS_VALUE;
-+  m_demuxerPts = DVD_NOPTS_VALUE;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 8f84557..f4df09c 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -118,6 +118,8 @@ protected:
-   // Components
-   MMAL_INTERLACETYPE_T m_interlace_mode;
-   EINTERLACEMETHOD  m_interlace_method;
-+  double            m_demuxerPts;
-+  double            m_decoderPts;
-   int               m_speed;
- 
-   CCriticalSection m_sharedSection;
-
-From 61928feb51d23e4550abfbf8ab26e933ff1fec4e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 8 Dec 2015 11:40:17 +0000
-Subject: [PATCH 07/93] Try to minimise latency through hardware decoder. This
- could reduce performance but keeps videoplayer happier
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 42 +++++++++++++++++-----
- 1 file changed, 33 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 35a9847..f96cc14 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -659,14 +659,21 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to disable interpolate timestamps mode on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-+  // limit number of callback structures in video_decode to reduce latency. Too low and video hangs.
-+  // negative numbers have special meaning. -1=size of DPB -2=size of DPB+1
-+  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -3);
-+  if (status != MMAL_SUCCESS)
-+    CLog::Log(LOGERROR, "%s::%s Failed to configure max num callbacks on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
-+
-   status = mmal_port_format_commit(m_dec_input);
-   if (status != MMAL_SUCCESS)
-   {
-     CLog::Log(LOGERROR, "%s::%s Failed to commit format for decoder input port %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
-     return false;
-   }
--  m_dec_input->buffer_size = m_dec_input->buffer_size_recommended;
--  m_dec_input->buffer_num = m_dec_input->buffer_num_recommended;
-+  // use a small number of large buffers to keep latency under control
-+  m_dec_input->buffer_size = 1024*1024;
-+  m_dec_input->buffer_num = 2;
- 
-   m_dec_input->userdata = (struct MMAL_PORT_USERDATA_T *)this;
-   status = mmal_port_enable(m_dec_input, dec_input_port_cb_static);
-@@ -755,13 +762,15 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-      if (pData)
-      {
-        // 500ms timeout
--       buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
--       if (!buffer)
-        {
--         CLog::Log(LOGERROR, "%s::%s - mmal_queue_get failed", CLASSNAME, __func__);
--         return VC_ERROR;
-+         CSingleExit unlock(m_sharedSection);
-+         buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-+         if (!buffer)
-+         {
-+           CLog::Log(LOGERROR, "%s::%s - mmal_queue_get failed", CLASSNAME, __func__);
-+           return VC_ERROR;
-+         }
-        }
--
-        mmal_buffer_header_reset(buffer);
-        buffer->cmd = 0;
-        buffer->pts = pts == DVD_NOPTS_VALUE ? MMAL_TIME_UNKNOWN : pts;
-@@ -833,11 +842,26 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
- 
-   if (!m_output_ready.empty())
-     ret |= VC_PICTURE;
--  else
-+  if (mmal_queue_length(m_dec_input_pool->queue) > 0)
-     ret |= VC_BUFFER;
- 
-+  bool slept = false;
-+  if (!ret)
-+  {
-+    slept = true;
-+    {
-+      // otherwise we busy spin
-+      CSingleExit unlock(m_sharedSection);
-+      Sleep(10);
-+    }
-+    if (!m_output_ready.empty())
-+      ret |= VC_PICTURE;
-+    if (mmal_queue_length(m_dec_input_pool->queue) > 0)
-+      ret |= VC_BUFFER;
-+  }
-+
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
- 
-   return ret;
- }
-
-From 0d9c905db96e1b465a26c834430a1783c000a5a9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 13 Jul 2015 19:27:25 +0100
-Subject: [PATCH 08/93] Enable QPU based deinterlace and remove resolution
- limit
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp         | 2 +-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 5 ++---
- xbmc/cores/omxplayer/OMXVideo.cpp                  | 6 +++---
- 3 files changed, 6 insertions(+), 7 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 2941d34..bee3af1 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -679,7 +679,7 @@ bool CMMALRenderer::Supports(ESCALINGMETHOD method)
- 
- EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- {
--  return VS_INTERLACEMETHOD_MMAL_ADVANCED;
-+  return m_sourceWidth * m_sourceHeight <= 576 * 720 ? VS_INTERLACEMETHOD_MMAL_ADVANCED : VS_INTERLACEMETHOD_MMAL_BOB;
- }
- 
- void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index f96cc14..0dda9ad 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -363,12 +363,11 @@ bool CMMALVideo::CreateDeinterlace(EINTERLACEMETHOD interlace_method)
-     CLog::Log(LOGERROR, "%s::%s Failed to create deinterlace component (status=%x %s)", CLASSNAME, __func__, status, mmal_status_to_string(status));
-     return false;
-   }
--  bool advanced_deinterlace = (interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF) &&
--      m_decoded_width * m_decoded_height <= 576 * 720;
-+  bool advanced_deinterlace = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF;
-   bool half_framerate = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF || interlace_method == VS_INTERLACEMETHOD_MMAL_BOB_HALF;
- 
-   MMAL_PARAMETER_IMAGEFX_PARAMETERS_T imfx_param = {{MMAL_PARAMETER_IMAGE_EFFECT_PARAMETERS, sizeof(imfx_param)},
--        advanced_deinterlace ? MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV : MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST, 3, {3, 0, half_framerate }};
-+        advanced_deinterlace ? MMAL_PARAM_IMAGEFX_DEINTERLACE_ADV : MMAL_PARAM_IMAGEFX_DEINTERLACE_FAST, 4, {3, 0, half_framerate, 1 }};
- 
-   status = mmal_port_parameter_set(m_deint->output[0], &imfx_param.hdr);
-   if (status != MMAL_SUCCESS)
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index e50c13a..20ad4fa 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -253,8 +253,7 @@ bool COMXVideo::PortSettingsChanged()
-   if(m_deinterlace)
-   {
-     EINTERLACEMETHOD interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
--    bool advanced_deinterlace = (interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF) &&
--        port_image.format.video.nFrameWidth * port_image.format.video.nFrameHeight <= 576 * 720;
-+    bool advanced_deinterlace = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED || interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF;
-     bool half_framerate = interlace_method == VS_INTERLACEMETHOD_MMAL_ADVANCED_HALF || interlace_method == VS_INTERLACEMETHOD_MMAL_BOB_HALF;
-     if (!advanced_deinterlace)
-     {
-@@ -275,10 +274,11 @@ bool COMXVideo::PortSettingsChanged()
-     OMX_INIT_STRUCTURE(image_filter);
- 
-     image_filter.nPortIndex = m_omx_image_fx.GetOutputPort();
--    image_filter.nNumParams = 3;
-+    image_filter.nNumParams = 4;
-     image_filter.nParams[0] = 3;
-     image_filter.nParams[1] = 0;
-     image_filter.nParams[2] = half_framerate;
-+    image_filter.nParams[3] = 1; // qpu
-     if (!advanced_deinterlace)
-       image_filter.eImageFilter = OMX_ImageFilterDeInterlaceFast;
-     else
-
-From 6cfe8e3a2fa86dbb63830eea0b1f9617ea6c9ba0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 16 Aug 2015 15:46:33 +0100
-Subject: [PATCH 09/93] Allow deinterlace with software decode
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 9 +++++++++
- 1 file changed, 9 insertions(+)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index bee3af1..9b5c666 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -646,6 +646,13 @@ bool CMMALRenderer::Supports(EDEINTERLACEMODE mode)
- 
- bool CMMALRenderer::Supports(EINTERLACEMETHOD method)
- {
-+  if (m_format == RENDER_FMT_YUV420P)
-+  {
-+    if (method == VS_INTERLACEMETHOD_DEINTERLACE_HALF)
-+      return true;
-+    else
-+      return false;
-+  }
-   if (method == VS_INTERLACEMETHOD_AUTO)
-     return true;
-   if (method == VS_INTERLACEMETHOD_MMAL_ADVANCED)
-@@ -679,6 +686,8 @@ bool CMMALRenderer::Supports(ESCALINGMETHOD method)
- 
- EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- {
-+  if (m_format == RENDER_FMT_YUV420P)
-+    return VS_INTERLACEMETHOD_DEINTERLACE_HALF;
-   return m_sourceWidth * m_sourceHeight <= 576 * 720 ? VS_INTERLACEMETHOD_MMAL_ADVANCED : VS_INTERLACEMETHOD_MMAL_BOB;
- }
- 
-
-From d5c49bf267a9dd4baf7e6be9127548adf64d899b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 15 Sep 2015 22:26:26 +0100
-Subject: [PATCH 10/93] omxplayer: Don't use AutoInterlaceMethod it changes at
- start of file
-
----
- xbmc/cores/omxplayer/OMXHelper.cpp | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index de493a2..7251fc1 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -130,19 +130,19 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     bool audio_fifo_low = false, video_fifo_low = false, audio_fifo_high = false, video_fifo_high = false;
- 
-     if (m_OmxPlayerState.interlace_method == VS_INTERLACEMETHOD_MAX)
--      m_OmxPlayerState.interlace_method = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-+      m_OmxPlayerState.interlace_method = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
- 
-     // if deinterlace setting has changed, we should close and open video
-     if (m_OmxPlayerState.current_deinterlace != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode ||
-        (m_OmxPlayerState.current_deinterlace != VS_DEINTERLACEMODE_OFF &&
--        m_OmxPlayerState.interlace_method != g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod)))
-+        m_OmxPlayerState.interlace_method != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod))
-     {
-       CLog::Log(LOGNOTICE, "%s - Reopen stream due to interlace change (%d,%d,%d,%d)", __FUNCTION__,
-         m_OmxPlayerState.current_deinterlace, CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode,
--        m_OmxPlayerState.interlace_method, g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod));
-+        m_OmxPlayerState.interlace_method, CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
- 
-       m_OmxPlayerState.current_deinterlace = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_DeinterlaceMode;
--      m_OmxPlayerState.interlace_method    = g_renderManager.AutoInterlaceMethod(CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod);
-+      m_OmxPlayerState.interlace_method    = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_InterlaceMethod;
-       reopen_stream = true;
-     }
- 
-
-From cb890fdeed45ff016c15f321d00f6cfe9cc3685d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Sep 2015 15:05:37 +0100
-Subject: [PATCH 11/93] Avoid calling render callback with the lock held to
- avoid a deadlock
-
----
- xbmc/cores/omxplayer/OMXVideo.cpp | 27 ++++++++++++++-------------
- xbmc/cores/omxplayer/OMXVideo.h   | 10 +++++++++-
- 2 files changed, 23 insertions(+), 14 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index 20ad4fa..eb13e6f 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -141,9 +141,8 @@ bool COMXVideo::NaluFormatStartCodes(enum AVCodecID codec, uint8_t *in_extradata
-   return false;    
- }
- 
--bool COMXVideo::PortSettingsChanged()
-+bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
- {
--  CSingleLock lock (m_critSection);
-   OMX_ERRORTYPE omx_err   = OMX_ErrorNone;
- 
-   if (m_settings_changed)
-@@ -186,15 +185,13 @@ bool COMXVideo::PortSettingsChanged()
-       port_image.format.video.xFramerate / (float)(1<<16), interlace.eMode, m_deinterlace);
- 
-   // let OMXPlayerVideo know about resolution so it can inform RenderManager
--  if (m_res_callback)
--  {
--    float display_aspect = 0.0f;
--    if (pixel_aspect.nX && pixel_aspect.nY)
--      display_aspect = (float)pixel_aspect.nX * port_image.format.video.nFrameWidth /
--        ((float)pixel_aspect.nY * port_image.format.video.nFrameHeight);
--    m_res_callback(m_res_ctx, port_image.format.video.nFrameWidth, port_image.format.video.nFrameHeight,
--        port_image.format.video.xFramerate / (float)(1<<16), display_aspect);
--  }
-+  resinfo.width = port_image.format.video.nFrameWidth;
-+  resinfo.height = port_image.format.video.nFrameHeight;
-+  resinfo.framerate = port_image.format.video.xFramerate / (float)(1<<16);
-+  resinfo.display_aspect = 0.0f;
-+  resinfo.changed = true;
-+  if (pixel_aspect.nX && pixel_aspect.nY)
-+    resinfo.display_aspect = (float)pixel_aspect.nX * port_image.format.video.nFrameWidth / ((float)pixel_aspect.nY * port_image.format.video.nFrameHeight);
- 
-   if (m_settings_changed)
-   {
-@@ -802,10 +799,11 @@ int COMXVideo::Decode(uint8_t *pData, int iSize, double dts, double pts)
-       }
-       //CLog::Log(LOGINFO, "VideD: dts:%.0f pts:%.0f size:%d)\n", dts, pts, iSize);
- 
-+      ResolutionUpdateInfo resinfo = {};
-       omx_err = m_omx_decoder.WaitForEvent(OMX_EventPortSettingsChanged, 0);
-       if (omx_err == OMX_ErrorNone)
-       {
--        if(!PortSettingsChanged())
-+        if(!PortSettingsChanged(resinfo))
-         {
-           CLog::Log(LOGERROR, "%s::%s - error PortSettingsChanged omx_err(0x%08x)\n", CLASSNAME, __func__, omx_err);
-           return false;
-@@ -814,11 +812,14 @@ int COMXVideo::Decode(uint8_t *pData, int iSize, double dts, double pts)
-       omx_err = m_omx_decoder.WaitForEvent(OMX_EventParamOrConfigChanged, 0);
-       if (omx_err == OMX_ErrorNone)
-       {
--        if(!PortSettingsChanged())
-+        if(!PortSettingsChanged(resinfo))
-         {
-           CLog::Log(LOGERROR, "%s::%s - error PortSettingsChanged (EventParamOrConfigChanged) omx_err(0x%08x)\n", CLASSNAME, __func__, omx_err);
-         }
-       }
-+      lock.Leave();
-+      if (resinfo.changed && m_res_callback)
-+        m_res_callback(m_res_ctx, resinfo.width, resinfo.height, resinfo.framerate, resinfo.display_aspect);
-     }
-     return true;
- 
-diff --git a/xbmc/cores/omxplayer/OMXVideo.h b/xbmc/cores/omxplayer/OMXVideo.h
-index d0634bb..7baefa5 100644
---- a/xbmc/cores/omxplayer/OMXVideo.h
-+++ b/xbmc/cores/omxplayer/OMXVideo.h
-@@ -41,6 +41,14 @@
- 
- typedef void (*ResolutionUpdateCallBackFn)(void *ctx, uint32_t width, uint32_t height, float framerate, float display_aspect);
- 
-+struct ResolutionUpdateInfo {
-+  uint32_t width;
-+  uint32_t height;
-+  float framerate;
-+  float display_aspect;
-+  bool changed;
-+};
-+
- class COMXVideo
- {
- public:
-@@ -50,7 +58,7 @@ public:
-   // Required overrides
-   bool SendDecoderConfig();
-   bool Open(CDVDStreamInfo &hints, OMXClock *clock, EDEINTERLACEMODE deinterlace = VS_DEINTERLACEMODE_OFF, bool hdmi_clock_sync = false);
--  bool PortSettingsChanged();
-+  bool PortSettingsChanged(ResolutionUpdateInfo &resinfo);
-   void RegisterResolutionUpdateCallBack(void *ctx, ResolutionUpdateCallBackFn callback) { m_res_ctx = ctx; m_res_callback = callback; }
-   void Close(void);
-   unsigned int GetFreeSpace();
-
-From 364da740e395d2091293f521a4bde7806b3218a0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Dec 2015 22:03:56 +0000
-Subject: [PATCH 12/93] Add settings option to enable MVC and frame packing
- support
-
----
- .../resource.language.en_gb/resources/strings.po   | 22 ++++++++++++++++++++++
- system/settings/rbp.xml                            | 14 ++++++++++++++
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp |  5 +++++
- xbmc/cores/omxplayer/OMXVideo.cpp                  |  5 +++++
- xbmc/settings/Settings.cpp                         |  2 ++
- xbmc/settings/Settings.h                           |  2 ++
- 6 files changed, 50 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index a697a61..01173ca 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18110,3 +18110,25 @@ msgstr ""
- msgctxt "#38023"
- msgid "Set my rating"
- msgstr ""
-+
-+#: system/settings/rbp.xml
-+msgctxt "#38027"
-+msgid "Decode the stereo stream from 3D files"
-+msgstr ""
-+
-+#. Description of setting "Decode the stereo stream from 3D files" with label #38027
-+#: system/settings/rbp.xml
-+msgctxt "#38028"
-+msgid "If enabled, videos created in Multiview Video Coding (MVC) format can also be watched in stereoscopic 3D. MVC format is typically found on 3D Blu-rays.[CR]Note: Processing of this data may reduce playback performance, so only enable if you require stereoscopic 3D support."
-+msgstr ""
-+
-+#: system/settings/rbp.xml
-+msgctxt "#38029"
-+msgid "Enable Full HD HDMI modes for stereoscopic 3D"
-+msgstr ""
-+
-+#. Description of setting "Enable Full HD HDMI modes for stereoscopic 3D" with label #38029
-+#: system/settings/rbp.xml
-+msgctxt "#38030"
-+msgid "This option uses frame-packing to output full resolution for 3D through HDMI.[CR]Enabling this improves quality of Multiview Video Coding (MVC) videos, but may not be supported by all displays."
-+msgstr ""
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 50fe36a..7a170c2 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -27,6 +27,13 @@
-           <control type="edit" format="integer" />
-         </setting>
-       </group>
-+      <group id="3">
-+        <setting id="videoplayer.supportmvc" type="boolean" label="38027" help="38028">
-+          <level>2</level>
-+          <default>true</default>
-+          <control type="toggle" />
-+        </setting>
-+      </group>
-     </category>
-     <category id="myvideos">
-       <group id="1">
-@@ -70,6 +77,13 @@
-           <control type="edit" format="integer" />
-         </setting>
-       </group>
-+      <group id="5">
-+        <setting id="videoscreen.framepacking" type="boolean" label="38029" help="38030">
-+          <level>2</level>
-+          <default>false</default>
-+          <control type="toggle" />
-+        </setting>
-+      </group>
-     </category>
-     <category id="audiooutput">
-       <group id="1">
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 0dda9ad..c09074d 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -551,6 +551,11 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-       // H.264
-       m_codingType = MMAL_ENCODING_H264;
-       m_pFormatName = "mmal-h264";
-+      if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC))
-+      {
-+        m_codingType = MMAL_ENCODING_MVC;
-+        m_pFormatName= "mmal-mvc";
-+      }
-     break;
-     case AV_CODEC_ID_H263:
-     case AV_CODEC_ID_MPEG4:
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index eb13e6f..ea8c0fc 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -427,6 +427,11 @@ bool COMXVideo::Open(CDVDStreamInfo &hints, OMXClock *clock, EDEINTERLACEMODE de
-           break;
-       }
-     }
-+    if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC))
-+    {
-+      m_codingType = OMX_VIDEO_CodingMVC;
-+      m_video_codec_name = "omx-mvc";
-+    }
-     break;
-     case AV_CODEC_ID_MPEG4:
-       // (role name) video_decoder.mpeg4
-diff --git a/xbmc/settings/Settings.cpp b/xbmc/settings/Settings.cpp
-index f50355b..5035cec 100644
---- a/xbmc/settings/Settings.cpp
-+++ b/xbmc/settings/Settings.cpp
-@@ -181,6 +181,7 @@ const std::string CSettings::SETTING_VIDEOPLAYER_USEVDA = "videoplayer.usevda";
- const std::string CSettings::SETTING_VIDEOPLAYER_USEMMAL = "videoplayer.usemmal";
- const std::string CSettings::SETTING_VIDEOPLAYER_USESTAGEFRIGHT = "videoplayer.usestagefright";
- const std::string CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE = "videoplayer.limitguiupdate";
-+const std::string CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC = "videoplayer.supportmvc";
- const std::string CSettings::SETTING_MYVIDEOS_SELECTACTION = "myvideos.selectaction";
- const std::string CSettings::SETTING_MYVIDEOS_EXTRACTFLAGS = "myvideos.extractflags";
- const std::string CSettings::SETTING_MYVIDEOS_EXTRACTCHAPTERTHUMBS = "myvideos.extractchapterthumbs";
-@@ -344,6 +345,7 @@ const std::string CSettings::SETTING_VIDEOSCREEN_VSYNC = "videoscreen.vsync";
- const std::string CSettings::SETTING_VIDEOSCREEN_GUICALIBRATION = "videoscreen.guicalibration";
- const std::string CSettings::SETTING_VIDEOSCREEN_TESTPATTERN = "videoscreen.testpattern";
- const std::string CSettings::SETTING_VIDEOSCREEN_LIMITEDRANGE = "videoscreen.limitedrange";
-+const std::string CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING = "videoscreen.framepacking";
- const std::string CSettings::SETTING_AUDIOOUTPUT_AUDIODEVICE = "audiooutput.audiodevice";
- const std::string CSettings::SETTING_AUDIOOUTPUT_CHANNELS = "audiooutput.channels";
- const std::string CSettings::SETTING_AUDIOOUTPUT_CONFIG = "audiooutput.config";
-diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h
-index 55e150d..f3ba426 100644
---- a/xbmc/settings/Settings.h
-+++ b/xbmc/settings/Settings.h
-@@ -137,6 +137,7 @@ public:
-   static const std::string SETTING_VIDEOPLAYER_USEMMAL;
-   static const std::string SETTING_VIDEOPLAYER_USESTAGEFRIGHT;
-   static const std::string SETTING_VIDEOPLAYER_LIMITGUIUPDATE;
-+  static const std::string SETTING_VIDEOPLAYER_SUPPORTMVC;
-   static const std::string SETTING_MYVIDEOS_SELECTACTION;
-   static const std::string SETTING_MYVIDEOS_EXTRACTFLAGS;
-   static const std::string SETTING_MYVIDEOS_EXTRACTCHAPTERTHUMBS;
-@@ -300,6 +301,7 @@ public:
-   static const std::string SETTING_VIDEOSCREEN_GUICALIBRATION;
-   static const std::string SETTING_VIDEOSCREEN_TESTPATTERN;
-   static const std::string SETTING_VIDEOSCREEN_LIMITEDRANGE;
-+  static const std::string SETTING_VIDEOSCREEN_FRAMEPACKING;
-   static const std::string SETTING_AUDIOOUTPUT_AUDIODEVICE;
-   static const std::string SETTING_AUDIOOUTPUT_CHANNELS;
-   static const std::string SETTING_AUDIOOUTPUT_CONFIG;
-
-From 71d3daeb3f44c6a7876415141e740464ce8b6c87 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 17 Dec 2015 15:38:34 +0000
-Subject: [PATCH 13/93] Don't adjust 3d rectangles in bypass mode
-
----
- xbmc/cores/VideoRenderers/BaseRenderer.cpp | 55 ++++++++++++++++--------------
- 1 file changed, 29 insertions(+), 26 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/BaseRenderer.cpp b/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-index 7889cf8..d4bb306 100644
---- a/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/BaseRenderer.cpp
-@@ -673,35 +673,38 @@ void CBaseRenderer::ManageDisplay()
-     else if(stereo_view == RENDER_STEREO_VIEW_RIGHT) stereo_view = RENDER_STEREO_VIEW_LEFT;
-   }
- 
--  switch(stereo_mode)
-+  if (m_format != RENDER_FMT_BYPASS)
-   {
--    case CONF_FLAGS_STEREO_MODE_TAB:
--      // Those are flipped in y
--      if (m_format == RENDER_FMT_CVBREF || m_format == RENDER_FMT_MEDIACODEC)
--      {
--        if (stereo_view == RENDER_STEREO_VIEW_LEFT)
--          m_sourceRect.y1 += m_sourceRect.y2*0.5f;
--        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--          m_sourceRect.y2 *= 0.5f;
--      }
--      else
--      {
--        if (stereo_view == RENDER_STEREO_VIEW_LEFT)
--          m_sourceRect.y2 *= 0.5f;
--        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--          m_sourceRect.y1 += m_sourceRect.y2*0.5f;
--      }
--      break;
-+    switch(stereo_mode)
-+    {
-+      case CONF_FLAGS_STEREO_MODE_TAB:
-+        // Those are flipped in y
-+        if (m_format == RENDER_FMT_CVBREF || m_format == RENDER_FMT_MEDIACODEC)
-+        {
-+          if (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+            m_sourceRect.y1 += m_sourceRect.y2*0.5f;
-+          else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+            m_sourceRect.y2 *= 0.5f;
-+        }
-+        else
-+        {
-+          if (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+            m_sourceRect.y2 *= 0.5f;
-+          else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+            m_sourceRect.y1 += m_sourceRect.y2*0.5f;
-+        }
-+        break;
- 
--    case CONF_FLAGS_STEREO_MODE_SBS:
--      if     (stereo_view == RENDER_STEREO_VIEW_LEFT)
--        m_sourceRect.x2 *= 0.5f;
--      else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
--        m_sourceRect.x1 += m_sourceRect.x2*0.5f;
--      break;
-+      case CONF_FLAGS_STEREO_MODE_SBS:
-+        if     (stereo_view == RENDER_STEREO_VIEW_LEFT)
-+          m_sourceRect.x2 *= 0.5f;
-+        else if(stereo_view == RENDER_STEREO_VIEW_RIGHT)
-+          m_sourceRect.x1 += m_sourceRect.x2*0.5f;
-+        break;
- 
--    default:
--      break;
-+      default:
-+        break;
-+    }
-   }
- 
-   CalcNormalDisplayRect(m_viewRect.x1, m_viewRect.y1, m_viewRect.Width(), m_viewRect.Height(), GetAspectRatio() * CDisplaySettings::GetInstance().GetPixelRatio(), CDisplaySettings::GetInstance().GetZoomAmount(), CDisplaySettings::GetInstance().GetVerticalShift());
-
-From 5ebb280be9de4ce882de665215c8bbda0c072864 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 14 Mar 2015 12:38:08 +0000
-Subject: [PATCH 14/93] Switch to using transform flags for 3d modes
-
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp      | 100 +++++++++---------------
- xbmc/cores/omxplayer/OMXPlayerVideo.cpp         |  83 +++++---------------
- xbmc/cores/omxplayer/OMXVideo.cpp               |  36 ++++-----
- xbmc/cores/omxplayer/OMXVideo.h                 |   2 +-
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp |  11 ++-
- 5 files changed, 79 insertions(+), 153 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 9b5c666..2dff194 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -452,11 +452,7 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     return;
-   }
- 
--  if (g_graphicsContext.GetStereoMode())
--    g_graphicsContext.SetStereoView(RENDER_STEREO_VIEW_LEFT);
-   ManageDisplay();
--  if (g_graphicsContext.GetStereoMode())
--    g_graphicsContext.SetStereoView(RENDER_STEREO_VIEW_OFF);
- 
-   // if running bypass, then the player might need the src/dst rects
-   // for sizing video playback on a layer other than the gles layer.
-@@ -693,10 +689,8 @@ EINTERLACEMETHOD CMMALRenderer::AutoInterlaceMethod()
- 
- void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect)
- {
--  // we get called twice a frame for left/right. Can ignore the rights.
--  if (g_graphicsContext.GetStereoView() == RENDER_STEREO_VIEW_RIGHT)
--    return;
-   CSingleLock lock(m_sharedSection);
-+  assert(g_graphicsContext.GetStereoView() != RENDER_STEREO_VIEW_RIGHT);
- 
-   if (!m_vout_input)
-     return;
-@@ -707,6 +701,10 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   bool stereo_invert                   = (m_iFlags & CONF_FLAGS_STEREO_CADANCE_RIGHT_LEFT) ? true : false;
-   RENDER_STEREO_MODE display_stereo_mode = g_graphicsContext.GetStereoMode();
- 
-+  // ignore video stereo mode when 3D display mode is disabled
-+  if (display_stereo_mode == RENDER_STEREO_MODE_OFF)
-+    video_stereo_mode = RENDER_STEREO_MODE_OFF;
-+
-   // fix up transposed video
-   if (m_renderOrientation == 90 || m_renderOrientation == 270)
-   {
-@@ -738,40 +736,17 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-   CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
- 
--  if (display_stereo_mode != RENDER_STEREO_MODE_OFF && display_stereo_mode != RENDER_STEREO_MODE_MONO)
--  switch (video_stereo_mode)
-+  if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-   {
--  case RENDER_STEREO_MODE_SPLIT_VERTICAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.x1 == 0.0f && DestRect.x2 * 2.0f == gui.Width() && !stereo_invert)
--    {
--      SrcRect.x2 *= 2.0f;
--      DestRect.x2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_RED_CYAN || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_GREEN_MAGENTA || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_YELLOW_BLUE)
--    {
--      SrcRect.x2 *= 2.0f;
--    }
--    break;
--
--  case RENDER_STEREO_MODE_SPLIT_HORIZONTAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.y1 == 0.0f && DestRect.y2 * 2.0f == gui.Height() && !stereo_invert)
--    {
--      SrcRect.y2 *= 2.0f;
--      DestRect.y2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_RED_CYAN || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_GREEN_MAGENTA || display_stereo_mode == RENDER_STEREO_MODE_ANAGLYPH_YELLOW_BLUE)
--    {
--      SrcRect.y2 *= 2.0f;
--    }
--    break;
--
--  default: break;
-+    float width = DestRect.x2 - DestRect.x1;
-+    DestRect.x1 *= 2.0f;
-+    DestRect.x2 = DestRect.x1 + 2.0f * width;
-+  }
-+  else if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+  {
-+    float height = DestRect.y2 - DestRect.y1;
-+    DestRect.y1 *= 2.0f;
-+    DestRect.y2 = DestRect.y1 + 2.0f * height;
-   }
- 
-   if (gui != display)
-@@ -787,7 +762,7 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
-   MMAL_DISPLAYREGION_T region;
-   memset(&region, 0, sizeof region);
- 
--  region.set                 = MMAL_DISPLAY_SET_DEST_RECT|MMAL_DISPLAY_SET_SRC_RECT|MMAL_DISPLAY_SET_FULLSCREEN|MMAL_DISPLAY_SET_NOASPECT|MMAL_DISPLAY_SET_MODE;
-+  region.set                 = MMAL_DISPLAY_SET_DEST_RECT|MMAL_DISPLAY_SET_SRC_RECT|MMAL_DISPLAY_SET_FULLSCREEN|MMAL_DISPLAY_SET_NOASPECT|MMAL_DISPLAY_SET_MODE|MMAL_DISPLAY_SET_TRANSFORM;
-   region.dest_rect.x         = lrintf(DestRect.x1);
-   region.dest_rect.y         = lrintf(DestRect.y1);
-   region.dest_rect.width     = lrintf(DestRect.Width());
-@@ -800,35 +775,32 @@ void CMMALRenderer::SetVideoRect(const CRect& InSrcRect, const CRect& InDestRect
- 
-   region.fullscreen = MMAL_FALSE;
-   region.noaspect = MMAL_TRUE;
-+  region.mode = MMAL_DISPLAY_MODE_LETTERBOX;
-+
-+  if (m_renderOrientation == 90)
-+    region.transform = MMAL_DISPLAY_ROT90;
-+  else if (m_renderOrientation == 180)
-+    region.transform = MMAL_DISPLAY_ROT180;
-+  else if (m_renderOrientation == 270)
-+    region.transform = MMAL_DISPLAY_ROT270;
-+  else
-+    region.transform = MMAL_DISPLAY_ROT0;
- 
--  if (m_renderOrientation)
--  {
--    region.set |= MMAL_DISPLAY_SET_TRANSFORM;
--    if (m_renderOrientation == 90)
--      region.transform = MMAL_DISPLAY_ROT90;
--    else if (m_renderOrientation == 180)
--      region.transform = MMAL_DISPLAY_ROT180;
--    else if (m_renderOrientation == 270)
--      region.transform = MMAL_DISPLAY_ROT270;
--    else assert(0);
--  }
--
--  if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_TOP_TO_TOP;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_TOP_TO_LEFT;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_LEFT_TO_TOP;
--  else if (video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    region.mode = MMAL_DISPLAY_MODE_STEREO_LEFT_TO_LEFT;
-+  if (m_video_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_TB);
-+  else if (m_video_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_SBS);
-   else
--    region.mode = MMAL_DISPLAY_MODE_LETTERBOX;
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_MONO);
-+
-+  if (m_StereoInvert)
-+    region.transform = (MMAL_DISPLAYTRANSFORM_T)(region.transform | DISPMANX_STEREOSCOPIC_INVERT);
- 
-   MMAL_STATUS_T status = mmal_util_set_display_region(m_vout_input, &region);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to set display region (status=%x %s)", CLASSNAME, __func__, status, mmal_status_to_string(status));
- 
--  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d mode:%d", CLASSNAME, __func__,
-+  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d t:%x", CLASSNAME, __func__,
-       region.src_rect.x, region.src_rect.y, region.src_rect.width, region.src_rect.height,
--      region.dest_rect.x, region.dest_rect.y, region.dest_rect.width, region.dest_rect.height, region.mode);
-+      region.dest_rect.x, region.dest_rect.y, region.dest_rect.width, region.dest_rect.height, region.transform);
- }
-diff --git a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-index 0e78de4..0e04360 100644
---- a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-@@ -650,10 +650,6 @@ int OMXPlayerVideo::GetFreeSpace()
- 
- void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRect)
- {
--  // we get called twice a frame for left/right. Can ignore the rights.
--  if (g_graphicsContext.GetStereoView() == RENDER_STEREO_VIEW_RIGHT)
--    return;
--
-   CRect SrcRect = InSrcRect, DestRect = InDestRect;
-   unsigned flags = GetStereoModeFlags(GetStereoMode());
-   RENDER_STEREO_MODE video_stereo_mode = (flags & CONF_FLAGS_STEREO_MODE_SBS) ? RENDER_STEREO_MODE_SPLIT_VERTICAL :
-@@ -661,6 +657,10 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-   bool stereo_invert                   = (flags & CONF_FLAGS_STEREO_CADANCE_RIGHT_LEFT) ? true : false;
-   RENDER_STEREO_MODE display_stereo_mode = g_graphicsContext.GetStereoMode();
- 
-+  // ignore video stereo mode when 3D display mode is disabled
-+  if (display_stereo_mode == RENDER_STEREO_MODE_OFF)
-+    video_stereo_mode = RENDER_STEREO_MODE_OFF;
-+
-   // fix up transposed video
-   if (m_hints.orientation == 90 || m_hints.orientation == 270)
-   {
-@@ -692,41 +692,17 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-   CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-   CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
- 
--  switch (video_stereo_mode)
-+  if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-   {
--  case RENDER_STEREO_MODE_SPLIT_VERTICAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.x1 == 0.0f && DestRect.x2 * 2.0f == gui.Width() && !stereo_invert)
--    {
--      SrcRect.x2 *= 2.0f;
--      DestRect.x2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (stereo_invert)
--    {
--      SrcRect.x1 += m_hints.width / 2;
--      SrcRect.x2 += m_hints.width / 2;
--    }
--    break;
--
--  case RENDER_STEREO_MODE_SPLIT_HORIZONTAL:
--    // optimisation - use simpler display mode in common case of unscaled 3d with same display mode
--    if (video_stereo_mode == display_stereo_mode && DestRect.y1 == 0.0f && DestRect.y2 * 2.0f == gui.Height() && !stereo_invert)
--    {
--      SrcRect.y2 *= 2.0f;
--      DestRect.y2 *= 2.0f;
--      video_stereo_mode = RENDER_STEREO_MODE_OFF;
--      display_stereo_mode = RENDER_STEREO_MODE_OFF;
--    }
--    else if (stereo_invert)
--    {
--      SrcRect.y1 += m_hints.height / 2;
--      SrcRect.y2 += m_hints.height / 2;
--    }
--    break;
--
--  default: break;
-+    float width = DestRect.x2 - DestRect.x1;
-+    DestRect.x1 *= 2.0f;
-+    DestRect.x2 = DestRect.x1 + 2.0f * width;
-+  }
-+  else if (display_stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+  {
-+    float height = DestRect.y2 - DestRect.y1;
-+    DestRect.y1 *= 2.0f;
-+    DestRect.y2 = DestRect.y1 + 2.0f * height;
-   }
- 
-   if (gui != display)
-@@ -738,7 +714,7 @@ void OMXPlayerVideo::SetVideoRect(const CRect &InSrcRect, const CRect &InDestRec
-     DestRect.y1 *= yscale;
-     DestRect.y2 *= yscale;
-   }
--  m_omxVideo.SetVideoRect(SrcRect, DestRect, video_stereo_mode, display_stereo_mode);
-+  m_omxVideo.SetVideoRect(SrcRect, DestRect, m_video_stereo_mode, m_display_stereo_mode, m_StereoInvert);
- }
- 
- void OMXPlayerVideo::RenderUpdateCallBack(const void *ctx, const CRect &SrcRect, const CRect &DestRect)
-@@ -753,40 +729,17 @@ void OMXPlayerVideo::ResolutionUpdateCallBack(uint32_t width, uint32_t height, f
-   uint32_t video_width   = CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth;
-   uint32_t video_height  = CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight;
- 
--  unsigned flags = 0;
-   ERenderFormat format = RENDER_FMT_BYPASS;
- 
-+  /* figure out steremode expected based on user settings and hints */
-+  unsigned flags = GetStereoModeFlags(GetStereoMode());
-+
-   if(m_bAllowFullscreen)
-   {
-     flags |= CONF_FLAGS_FULLSCREEN;
-     m_bAllowFullscreen = false; // only allow on first configure
-   }
- 
--  flags |= GetStereoModeFlags(GetStereoMode());
--
--  if(flags & CONF_FLAGS_STEREO_MODE_SBS)
--  {
--    if(g_Windowing.Support3D(video_width, video_height, D3DPRESENTFLAG_MODE3DSBS))
--      CLog::Log(LOGNOTICE, "3DSBS movie found");
--    else
--    {
--      flags &= ~CONF_FLAGS_STEREO_MODE_MASK(~0);
--      CLog::Log(LOGNOTICE, "3DSBS movie found but not supported");
--    }
--  }
--  else if(flags & CONF_FLAGS_STEREO_MODE_TAB)
--  {
--    if(g_Windowing.Support3D(video_width, video_height, D3DPRESENTFLAG_MODE3DTB))
--      CLog::Log(LOGNOTICE, "3DTB movie found");
--    else
--    {
--      flags &= ~CONF_FLAGS_STEREO_MODE_MASK(~0);
--      CLog::Log(LOGNOTICE, "3DTB movie found but not supported");
--    }
--  }
--  else
--    CLog::Log(LOGNOTICE, "not a 3D movie");
--
-   unsigned int iDisplayWidth  = width;
-   unsigned int iDisplayHeight = height;
- 
-diff --git a/xbmc/cores/omxplayer/OMXVideo.cpp b/xbmc/cores/omxplayer/OMXVideo.cpp
-index ea8c0fc..a9825a0 100644
---- a/xbmc/cores/omxplayer/OMXVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXVideo.cpp
-@@ -217,15 +217,6 @@ bool COMXVideo::PortSettingsChanged(ResolutionUpdateInfo &resinfo)
-   OMX_INIT_STRUCTURE(configDisplay);
-   configDisplay.nPortIndex = m_omx_render.GetInputPort();
- 
--  configDisplay.set = OMX_DISPLAY_SET_TRANSFORM;
--  configDisplay.transform = m_transform;
--  omx_err = m_omx_render.SetConfig(OMX_IndexConfigDisplayRegion, &configDisplay);
--  if(omx_err != OMX_ErrorNone)
--  {
--    CLog::Log(LOGWARNING, "%s::%s - could not set transform : %d", CLASSNAME, __func__, m_transform);
--    return false;
--  }
--
-   if(m_hdmi_clock_sync)
-   {
-     OMX_CONFIG_LATENCYTARGETTYPE latencyTarget;
-@@ -847,7 +838,7 @@ void COMXVideo::Reset(void)
- }
- 
- ///////////////////////////////////////////////////////////////////////////////////////////
--void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode)
-+void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode, bool stereo_invert)
- {
-   CSingleLock lock (m_critSection);
-   if(!m_is_open)
-@@ -857,7 +848,7 @@ void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER
- 
-   OMX_INIT_STRUCTURE(configDisplay);
-   configDisplay.nPortIndex = m_omx_render.GetInputPort();
--  configDisplay.set                 = (OMX_DISPLAYSETTYPE)(OMX_DISPLAY_SET_DEST_RECT|OMX_DISPLAY_SET_SRC_RECT|OMX_DISPLAY_SET_FULLSCREEN|OMX_DISPLAY_SET_NOASPECT|OMX_DISPLAY_SET_MODE);
-+  configDisplay.set                 = (OMX_DISPLAYSETTYPE)(OMX_DISPLAY_SET_DEST_RECT|OMX_DISPLAY_SET_SRC_RECT|OMX_DISPLAY_SET_FULLSCREEN|OMX_DISPLAY_SET_NOASPECT|OMX_DISPLAY_SET_MODE|OMX_DISPLAY_SET_TRANSFORM);
-   configDisplay.dest_rect.x_offset  = lrintf(DestRect.x1);
-   configDisplay.dest_rect.y_offset  = lrintf(DestRect.y1);
-   configDisplay.dest_rect.width     = lrintf(DestRect.Width());
-@@ -870,23 +861,24 @@ void COMXVideo::SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER
- 
-   configDisplay.fullscreen = OMX_FALSE;
-   configDisplay.noaspect = OMX_TRUE;
-+  configDisplay.mode = OMX_DISPLAY_MODE_LETTERBOX;
-+  configDisplay.transform = m_transform;
- 
--  if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_TOP_TO_TOP;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL && display_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_TOP_TO_LEFT;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_LEFT_TO_TOP;
--  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL && display_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    configDisplay.mode = OMX_DISPLAY_MODE_STEREO_LEFT_TO_LEFT;
-+  if (video_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_TB);
-+  else if (video_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_SBS);
-   else
--    configDisplay.mode = OMX_DISPLAY_MODE_LETTERBOX;
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_MONO);
-+
-+  if (stereo_invert)
-+    configDisplay.transform = (OMX_DISPLAYTRANSFORMTYPE)(configDisplay.transform | DISPMANX_STEREOSCOPIC_INVERT);
- 
-   m_omx_render.SetConfig(OMX_IndexConfigDisplayRegion, &configDisplay);
- 
--  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d mode:%d", CLASSNAME, __func__,
-+  CLog::Log(LOGDEBUG, "%s::%s %d,%d,%d,%d -> %d,%d,%d,%d t:%x", CLASSNAME, __func__,
-       configDisplay.src_rect.x_offset, configDisplay.src_rect.y_offset, configDisplay.src_rect.width, configDisplay.src_rect.height,
--      configDisplay.dest_rect.x_offset, configDisplay.dest_rect.y_offset, configDisplay.dest_rect.width, configDisplay.dest_rect.height, configDisplay.mode);
-+      configDisplay.dest_rect.x_offset, configDisplay.dest_rect.y_offset, configDisplay.dest_rect.width, configDisplay.dest_rect.height, configDisplay.transform);
- }
- 
- int COMXVideo::GetInputBufferSize()
-diff --git a/xbmc/cores/omxplayer/OMXVideo.h b/xbmc/cores/omxplayer/OMXVideo.h
-index 7baefa5..31982b4 100644
---- a/xbmc/cores/omxplayer/OMXVideo.h
-+++ b/xbmc/cores/omxplayer/OMXVideo.h
-@@ -67,7 +67,7 @@ public:
-   void Reset(void);
-   void SetDropState(bool bDrop);
-   std::string GetDecoderName() { return m_video_codec_name; };
--  void SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode);
-+  void SetVideoRect(const CRect& SrcRect, const CRect& DestRect, RENDER_STEREO_MODE video_mode, RENDER_STEREO_MODE display_mode, bool stereo_invert);
-   int GetInputBufferSize();
-   bool GetPlayerInfo(double &match, double &phase, double &pll);
-   void SubmitEOS();
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index 5d5b74b..443d037 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -234,7 +234,9 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-       /* inform TV of any 3D settings. Note this property just applies to next hdmi mode change, so no need to call for 2D modes */
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_3D_STRUCTURE;
--      if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+      if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING) && CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC) && res.fRefreshRate <= 30.0f)
-+        property.param1 = HDMI_3D_FORMAT_FRAME_PACKING;
-+      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-         property.param1 = HDMI_3D_FORMAT_SBS_HALF;
-       else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-         property.param1 = HDMI_3D_FORMAT_TB_HALF;
-@@ -334,6 +336,13 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-   DISPMANX_TRANSFORM_T transform = DISPMANX_NO_ROTATE;
-   DISPMANX_UPDATE_HANDLE_T dispman_update = m_DllBcmHost->vc_dispmanx_update_start(0);
- 
-+  if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+    transform = DISPMANX_STEREOSCOPIC_SBS;
-+  else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+    transform = DISPMANX_STEREOSCOPIC_TB;
-+  else
-+    transform = DISPMANX_STEREOSCOPIC_MONO;
-+
-   CLog::Log(LOGDEBUG, "EGL set resolution %dx%d -> %dx%d @ %.2f fps (%d,%d) flags:%x aspect:%.2f\n",
-       m_width, m_height, dst_rect.width, dst_rect.height, res.fRefreshRate, GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), (int)res.dwFlags, res.fPixelRatio);
- 
-
-From 2be3612226ee01a6d294c6ca6a7d8d0849bd4221 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 21 Jun 2015 17:42:03 +0100
-Subject: [PATCH 15/93] Remove unused Support3D function
-
----
- xbmc/windowing/egl/WinSystemEGL.cpp | 29 -----------------------------
- xbmc/windowing/egl/WinSystemEGL.h   |  1 -
- 2 files changed, 30 deletions(-)
-
-diff --git a/xbmc/windowing/egl/WinSystemEGL.cpp b/xbmc/windowing/egl/WinSystemEGL.cpp
-index 696ded1..718fb4c 100644
---- a/xbmc/windowing/egl/WinSystemEGL.cpp
-+++ b/xbmc/windowing/egl/WinSystemEGL.cpp
-@@ -531,35 +531,6 @@ EGLConfig CWinSystemEGL::GetEGLConfig()
-   return m_config;
- }
- 
--// the logic in this function should match whether CBaseRenderer::FindClosestResolution picks a 3D mode
--bool CWinSystemEGL::Support3D(int width, int height, uint32_t mode) const
--{
--  RESOLUTION_INFO &curr = CDisplaySettings::GetInstance().GetResolutionInfo(g_graphicsContext.GetVideoResolution());
--
--  // if we are using automatic hdmi mode switching
--  if (CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_ADJUSTREFRESHRATE) != ADJUST_REFRESHRATE_OFF)
--  {
--    int searchWidth = curr.iScreenWidth;
--    int searchHeight = curr.iScreenHeight;
--
--    // only search the custom resolutions
--    for (unsigned int i = (int)RES_DESKTOP; i < CDisplaySettings::GetInstance().ResolutionInfoSize(); i++)
--    {
--      RESOLUTION_INFO res = CDisplaySettings::GetInstance().GetResolutionInfo(i);
--      if(res.iScreenWidth == searchWidth && res.iScreenHeight == searchHeight && (res.dwFlags & mode))
--        return true;
--    }
--  }
--  // otherwise just consider current mode
--  else
--  {
--     if (curr.dwFlags & mode)
--       return true;
--  }
--
--  return false;
--}
--
- bool CWinSystemEGL::ClampToGUIDisplayLimits(int &width, int &height)
- {
-   width = width > m_nWidth ? m_nWidth : width;
-diff --git a/xbmc/windowing/egl/WinSystemEGL.h b/xbmc/windowing/egl/WinSystemEGL.h
-index 9d4baf6..1ec4225 100644
---- a/xbmc/windowing/egl/WinSystemEGL.h
-+++ b/xbmc/windowing/egl/WinSystemEGL.h
-@@ -59,7 +59,6 @@ public:
-   virtual void  Register(IDispResource *resource);
-   virtual void  Unregister(IDispResource *resource);
- 
--  virtual bool  Support3D(int width, int height, uint32_t mode)     const;
-   virtual bool  ClampToGUIDisplayLimits(int &width, int &height);
- 
-   EGLConfig     GetEGLConfig();
-
-From ad81921b2e03b01bed2d40f0f1aff697cb48fa56 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Dec 2015 21:08:34 +0000
-Subject: [PATCH 16/93] Avoid switching stereo mode based on current display
- mode
-
----
- xbmc/guilib/GraphicContext.cpp | 24 ------------------------
- 1 file changed, 24 deletions(-)
-
-diff --git a/xbmc/guilib/GraphicContext.cpp b/xbmc/guilib/GraphicContext.cpp
-index a1b8812..60979bb 100644
---- a/xbmc/guilib/GraphicContext.cpp
-+++ b/xbmc/guilib/GraphicContext.cpp
-@@ -429,30 +429,6 @@ void CGraphicContext::SetVideoResolutionInternal(RESOLUTION res, bool forceUpdat
-   Lock();
- 
-   RESOLUTION_INFO info_org  = CDisplaySettings::GetInstance().GetResolutionInfo(res);
--  RESOLUTION_INFO info_last = CDisplaySettings::GetInstance().GetResolutionInfo(lastRes);
--
--  RENDER_STEREO_MODE stereo_mode = m_stereoMode;
--
--  // if the new resolution is an actual stereo mode, switch to that
--  // if the old resolution was an actual stereo mode and renderer is still in old 3D mode, switch to no 3d mode
--  if (info_org.dwFlags & D3DPRESENTFLAG_MODE3DTB)
--    stereo_mode = RENDER_STEREO_MODE_SPLIT_HORIZONTAL;
--  else if (info_org.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
--    stereo_mode = RENDER_STEREO_MODE_SPLIT_VERTICAL;
--  else if ((info_last.dwFlags & D3DPRESENTFLAG_MODE3DTB)
--        && m_stereoMode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
--    stereo_mode = RENDER_STEREO_MODE_OFF;
--  else if ((info_last.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
--        && m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
--    stereo_mode = RENDER_STEREO_MODE_OFF;
--
--  if(stereo_mode != m_stereoMode)
--  {
--    m_stereoView     = RENDER_STEREO_VIEW_OFF;
--    m_stereoMode     = stereo_mode;
--    m_nextStereoMode = stereo_mode;
--    CSettings::GetInstance().SetInt(CSettings::SETTING_VIDEOSCREEN_STEREOSCOPICMODE, (int)m_stereoMode);
--  }
- 
-   RESOLUTION_INFO info_mod = GetResInfo(res);
- 
-
-From be69b44990015a874305ef96e7fbdef7f815599e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 21 Jun 2015 18:53:29 +0100
-Subject: [PATCH 17/93] Drop reporting 3D modes and just use current rendering
- mode to request 3D signalling
-
-[rbp] Add ntsc version of 48Hz mode
-
-[rbp] Extract the correct resolution with Pi LCD
-
-[rpb] Change order or CEA and DMT mode probing so CEA modes are preferred
-
-[rbp] Allow interlaced resolutions into supported hdmi mode list 2
----
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp | 119 +++++++++---------------
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h   |   4 +-
- 2 files changed, 47 insertions(+), 76 deletions(-)
-
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index 443d037..ee29770 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -28,6 +28,9 @@
- #include "linux/RBP.h"
- #include "utils/StringUtils.h"
- #include "settings/Settings.h"
-+#include "guilib/GraphicContext.h"
-+#include "guilib/StereoscopicsManager.h"
-+#include "rendering/RenderSystem.h"
- #include <cassert>
- 
- #ifndef __VIDEOCORE4__
-@@ -185,12 +188,13 @@ bool CEGLNativeTypeRaspberryPI::GetNativeResolution(RESOLUTION_INFO *res) const
- }
- 
- #if defined(TARGET_RASPBERRY_PI)
--int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions)
-+int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions, bool desktop)
- {
-+  uint32_t mask = desktop ? D3DPRESENTFLAG_MODEMASK : D3DPRESENTFLAG_MODE3DSBS|D3DPRESENTFLAG_MODE3DTB;
-   for (int i = 0; i < (int)resolutions.size(); i++)
-   {
-     if(resolutions[i].iScreenWidth == res.iScreenWidth && resolutions[i].iScreenHeight == res.iScreenHeight && resolutions[i].fRefreshRate == res.fRefreshRate &&
--      (resolutions[i].dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK))
-+      (resolutions[i].dwFlags & mask) == (res.dwFlags & mask))
-     {
-        return i;
-     }
-@@ -200,13 +204,14 @@ int CEGLNativeTypeRaspberryPI::FindMatchingResolution(const RESOLUTION_INFO &res
- #endif
- 
- #if defined(TARGET_RASPBERRY_PI)
--int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions)
-+int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions, bool desktop /* = false */)
- {
-   SetResolutionString(res);
--  int i = FindMatchingResolution(res, resolutions);
-+  int i = FindMatchingResolution(res, resolutions, desktop);
-   if (i>=0)
-   {  // don't replace a progressive resolution with an interlaced one of same resolution
--     resolutions[i] = res;
-+    if (!(res.dwFlags & D3DPRESENTFLAG_INTERLACED))
-+      resolutions[i] = res;
-   }
-   else
-   {
-@@ -224,25 +229,28 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- 
-   DestroyDispmaxWindow();
- 
-+  RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
-   if(GETFLAGS_GROUP(res.dwFlags) && GETFLAGS_MODE(res.dwFlags))
-   {
-+    uint32_t mode3d = HDMI_3D_FORMAT_NONE;
-     sem_init(&m_tv_synced, 0, 0);
-     m_DllBcmHost->vc_tv_register_callback(CallbackTvServiceCallback, this);
- 
--    if (res.dwFlags & (D3DPRESENTFLAG_MODE3DSBS|D3DPRESENTFLAG_MODE3DTB))
-+    if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL || stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-     {
-       /* inform TV of any 3D settings. Note this property just applies to next hdmi mode change, so no need to call for 2D modes */
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_3D_STRUCTURE;
-       if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOSCREEN_FRAMEPACKING) && CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_SUPPORTMVC) && res.fRefreshRate <= 30.0f)
-         property.param1 = HDMI_3D_FORMAT_FRAME_PACKING;
--      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+      else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-         property.param1 = HDMI_3D_FORMAT_SBS_HALF;
--      else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+      else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-         property.param1 = HDMI_3D_FORMAT_TB_HALF;
-       else
-         property.param1 = HDMI_3D_FORMAT_NONE;
-       property.param2 = 0;
-+      mode3d = property.param1;
-       vc_tv_hdmi_set_property(&property);
-     }
- 
-@@ -261,19 +269,19 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- 
-     if (success == 0)
-     {
--      CLog::Log(LOGDEBUG, "EGL set HDMI mode (%d,%d)=%d%s%s\n",
-+      CLog::Log(LOGDEBUG, "EGL set HDMI mode (%d,%d)=%d %s%s\n",
-                           GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), success,
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS) ? " SBS":"",
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DTB) ? " TB":"");
-+                          CStereoscopicsManager::GetInstance().ConvertGuiStereoModeToString(stereo_mode),
-+                          mode3d==HDMI_3D_FORMAT_FRAME_PACKING ? " FP" : mode3d==HDMI_3D_FORMAT_SBS_HALF ? " SBS" : mode3d==HDMI_3D_FORMAT_TB_HALF ? " TB" : "");
- 
-       sem_wait(&m_tv_synced);
-     }
-     else
-     {
--      CLog::Log(LOGERROR, "EGL failed to set HDMI mode (%d,%d)=%d%s%s\n",
-+      CLog::Log(LOGERROR, "EGL failed to set HDMI mode (%d,%d)=%d %s%s\n",
-                           GETFLAGS_GROUP(res.dwFlags), GETFLAGS_MODE(res.dwFlags), success,
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS) ? " SBS":"",
--                          (res.dwFlags & D3DPRESENTFLAG_MODE3DTB) ? " TB":"");
-+                          CStereoscopicsManager::GetInstance().ConvertGuiStereoModeToString(stereo_mode),
-+                          mode3d==HDMI_3D_FORMAT_FRAME_PACKING ? " FP" : mode3d==HDMI_3D_FORMAT_SBS_HALF ? " SBS" : mode3d==HDMI_3D_FORMAT_TB_HALF ? " TB" : "");
-     }
-     m_DllBcmHost->vc_tv_unregister_callback(CallbackTvServiceCallback);
-     sem_destroy(&m_tv_synced);
-@@ -336,9 +344,9 @@ bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
-   DISPMANX_TRANSFORM_T transform = DISPMANX_NO_ROTATE;
-   DISPMANX_UPDATE_HANDLE_T dispman_update = m_DllBcmHost->vc_dispmanx_update_start(0);
- 
--  if (res.dwFlags & D3DPRESENTFLAG_MODE3DSBS)
-+  if (stereo_mode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-     transform = DISPMANX_STEREOSCOPIC_SBS;
--  else if (res.dwFlags & D3DPRESENTFLAG_MODE3DTB)
-+  else if (stereo_mode == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-     transform = DISPMANX_STEREOSCOPIC_TB;
-   else
-     transform = DISPMANX_STEREOSCOPIC_MONO;
-@@ -445,10 +453,8 @@ static void SetResolutionString(RESOLUTION_INFO &res)
-   res.iWidth = gui_width;
-   res.iHeight = gui_height;
- 
--  res.strMode = StringUtils::Format("%dx%d (%dx%d) @ %.2f%s%s%s - Full Screen", res.iScreenWidth, res.iScreenHeight, res.iWidth, res.iHeight, res.fRefreshRate,
--    res.dwFlags & D3DPRESENTFLAG_INTERLACED ? "i" : "",
--    res.dwFlags & D3DPRESENTFLAG_MODE3DTB   ? " 3DTB" : "",
--    res.dwFlags & D3DPRESENTFLAG_MODE3DSBS  ? " 3DSBS" : "");
-+  res.strMode = StringUtils::Format("%dx%d (%dx%d) @ %.2f%s - Full Screen", res.iScreenWidth, res.iScreenHeight, res.iWidth, res.iHeight, res.fRefreshRate,
-+    res.dwFlags & D3DPRESENTFLAG_INTERLACED ? "i" : "");
- }
- 
- static SDTV_ASPECT_T get_sdtv_aspect_from_display_aspect(float display_aspect)
-@@ -503,17 +509,6 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-       m_desktopRes.iScreenHeight= tv_state.display.hdmi.height;
-       m_desktopRes.dwFlags      = MAKEFLAGS(tv_state.display.hdmi.group, tv_state.display.hdmi.mode, tv_state.display.hdmi.scan_mode);
-       m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((HDMI_ASPECT_T)tv_state.display.hdmi.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
--      // Also add 3D flags
--      if (tv_state.display.hdmi.format_3d == HDMI_3D_FORMAT_SBS_HALF)
--      {
--        m_desktopRes.dwFlags |= D3DPRESENTFLAG_MODE3DSBS;
--        m_desktopRes.fPixelRatio *= 2.0;
--      }
--      else if (tv_state.display.hdmi.format_3d == HDMI_3D_FORMAT_TB_HALF)
--      {
--        m_desktopRes.dwFlags |= D3DPRESENTFLAG_MODE3DTB;
--        m_desktopRes.fPixelRatio *= 0.5;
--      }
-       HDMI_PROPERTY_PARAM_T property;
-       property.property = HDMI_PROPERTY_PIXEL_CLOCK_TYPE;
-       vc_tv_hdmi_get_property(&property);
-@@ -531,6 +526,18 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-       m_desktopRes.fRefreshRate = (float)tv_state.display.sdtv.frame_rate;
-       m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((SDTV_ASPECT_T)tv_state.display.sdtv.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
-     }
-+    else if ((tv_state.state & VC_LCD_ATTACHED_DEFAULT) != 0) // lcd
-+    {
-+      m_desktopRes.iScreen      = 0;
-+      m_desktopRes.bFullScreen  = true;
-+      m_desktopRes.iWidth       = tv_state.display.sdtv.width;
-+      m_desktopRes.iHeight      = tv_state.display.sdtv.height;
-+      m_desktopRes.iScreenWidth = tv_state.display.sdtv.width;
-+      m_desktopRes.iScreenHeight= tv_state.display.sdtv.height;
-+      m_desktopRes.dwFlags      = MAKEFLAGS(HDMI_RES_GROUP_INVALID, 0, 0);
-+      m_desktopRes.fRefreshRate = (float)tv_state.display.sdtv.frame_rate;
-+      m_desktopRes.fPixelRatio  = tv_state.display.hdmi.display_options.aspect == 0 ? 1.0f : get_display_aspect_ratio((SDTV_ASPECT_T)tv_state.display.sdtv.display_options.aspect) / ((float)m_desktopRes.iScreenWidth / (float)m_desktopRes.iScreenHeight);
-+    }
- 
-     SetResolutionString(m_desktopRes);
- 
-@@ -541,11 +548,13 @@ bool CEGLNativeTypeRaspberryPI::ProbeResolutions(std::vector<RESOLUTION_INFO> &r
-     CLog::Log(LOGDEBUG, "EGL initial desktop resolution %s (%.2f)\n", m_desktopRes.strMode.c_str(), m_desktopRes.fPixelRatio);
-   }
- 
--  GetSupportedModes(HDMI_RES_GROUP_CEA, resolutions);
--  GetSupportedModes(HDMI_RES_GROUP_DMT, resolutions);
--
-+  if(GETFLAGS_GROUP(m_desktopRes.dwFlags) && GETFLAGS_MODE(m_desktopRes.dwFlags))
-   {
--    AddUniqueResolution(m_desktopRes, resolutions);
-+    GetSupportedModes(HDMI_RES_GROUP_DMT, resolutions);
-+    GetSupportedModes(HDMI_RES_GROUP_CEA, resolutions);
-+  }
-+  {
-+    AddUniqueResolution(m_desktopRes, resolutions, true);
-     CLog::Log(LOGDEBUG, "EGL probe resolution %s:%x\n", m_desktopRes.strMode.c_str(), m_desktopRes.dwFlags);
-   }
- 
-@@ -638,54 +647,16 @@ void CEGLNativeTypeRaspberryPI::GetSupportedModes(HDMI_RES_GROUP_T group, std::v
-       if (!m_desktopRes.dwFlags && prefer_group == group && prefer_mode == tv->code)
-         m_desktopRes = res;
- 
--      if (res.dwFlags & D3DPRESENTFLAG_INTERLACED)
--        continue;
--
-       AddUniqueResolution(res, resolutions);
-       CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f) %s%s:%x\n", i, res.strMode.c_str(), res.fPixelRatio,
-           tv->native ? "N" : "", tv->scan_mode ? "I" : "", tv->code);
- 
--      if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
-+      if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 48 || tv->frame_rate == 60 || tv->frame_rate == 72)
-       {
-         RESOLUTION_INFO res2 = res;
-         res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
-         AddUniqueResolution(res2, resolutions);
-       }
--
--      // Also add 3D versions of modes
--      if (tv->struct_3d_mask & HDMI_3D_STRUCT_SIDE_BY_SIDE_HALF_HORIZONTAL)
--      {
--        RESOLUTION_INFO res2 = res;
--        res2.dwFlags |= D3DPRESENTFLAG_MODE3DSBS;
--        res2.fPixelRatio    = get_display_aspect_ratio((HDMI_ASPECT_T)tv->aspect_ratio) / ((float)res2.iScreenWidth / (float)res2.iScreenHeight);
--        res2.fPixelRatio   *= 2.0f;
--        res2.iSubtitles    = (int)(0.965 * res2.iHeight);
--
--        AddUniqueResolution(res2, resolutions);
--        CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f)\n", i, res2.strMode.c_str(), res2.fPixelRatio);
--        if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
--        {
--          res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
--          AddUniqueResolution(res2, resolutions);
--        }
--      }
--      if (tv->struct_3d_mask & HDMI_3D_STRUCT_TOP_AND_BOTTOM)
--      {
--        RESOLUTION_INFO res2 = res;
--        res2.dwFlags |= D3DPRESENTFLAG_MODE3DTB;
--        res2.fPixelRatio    = get_display_aspect_ratio((HDMI_ASPECT_T)tv->aspect_ratio) / ((float)res2.iScreenWidth / (float)res2.iScreenHeight);
--        res2.fPixelRatio   *= 0.5f;
--        res2.iSubtitles    = (int)(0.965 * res2.iHeight);
--
--        AddUniqueResolution(res2, resolutions);
--        CLog::Log(LOGDEBUG, "EGL mode %d: %s (%.2f)\n", i, res2.strMode.c_str(), res2.fPixelRatio);
--        if (tv->frame_rate == 24 || tv->frame_rate == 30 || tv->frame_rate == 60)
--        {
--          res2.fRefreshRate  = (float)tv->frame_rate * (1000.0f/1001.0f);
--          AddUniqueResolution(res2, resolutions);
--        }
--
--      }
-     }
-   }
-   if (supported_modes)
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-index a0acb1a..e5bcae7 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.h
-@@ -69,7 +69,7 @@ private:
-   static void CallbackTvServiceCallback(void *userdata, uint32_t reason, uint32_t param1, uint32_t param2);
- 
-   void DestroyDispmaxWindow();
--  int FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions);
--  int AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions);
-+  int FindMatchingResolution(const RESOLUTION_INFO &res, const std::vector<RESOLUTION_INFO> &resolutions, bool desktop);
-+  int AddUniqueResolution(RESOLUTION_INFO &res, std::vector<RESOLUTION_INFO> &resolutions, bool desktop = false);
- #endif
- };
-
-From 5d3349935e282c6d4faef746a5b8a9934676d4c6 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jun 2015 16:27:15 +0100
-Subject: [PATCH 18/93] Consider stereomode when creating a new window
-
-We might be changing from a non-3D to a 3D mode
----
- xbmc/windowing/egl/WinSystemEGL.cpp | 6 +++++-
- xbmc/windowing/egl/WinSystemEGL.h   | 1 +
- 2 files changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/windowing/egl/WinSystemEGL.cpp b/xbmc/windowing/egl/WinSystemEGL.cpp
-index 718fb4c..a0b285c 100644
---- a/xbmc/windowing/egl/WinSystemEGL.cpp
-+++ b/xbmc/windowing/egl/WinSystemEGL.cpp
-@@ -52,6 +52,7 @@ CWinSystemEGL::CWinSystemEGL() : CWinSystemBase()
-   m_surface           = EGL_NO_SURFACE;
-   m_context           = EGL_NO_CONTEXT;
-   m_config            = NULL;
-+  m_stereo_mode       = RENDER_STEREO_MODE_OFF;
- 
-   m_egl               = NULL;
-   m_iVSyncMode        = 0;
-@@ -273,6 +274,7 @@ bool CWinSystemEGL::CreateNewWindow(const std::string& name, bool fullScreen, RE
- {
-   RESOLUTION_INFO current_resolution;
-   current_resolution.iWidth = current_resolution.iHeight = 0;
-+  RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
- 
-   m_nWidth        = res.iWidth;
-   m_nHeight       = res.iHeight;
-@@ -284,12 +286,14 @@ bool CWinSystemEGL::CreateNewWindow(const std::string& name, bool fullScreen, RE
-     current_resolution.iWidth == res.iWidth && current_resolution.iHeight == res.iHeight &&
-     current_resolution.iScreenWidth == res.iScreenWidth && current_resolution.iScreenHeight == res.iScreenHeight &&
-     m_bFullScreen == fullScreen && current_resolution.fRefreshRate == res.fRefreshRate &&
--    (current_resolution.dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK))
-+    (current_resolution.dwFlags & D3DPRESENTFLAG_MODEMASK) == (res.dwFlags & D3DPRESENTFLAG_MODEMASK) &&
-+    m_stereo_mode == stereo_mode)
-   {
-     CLog::Log(LOGDEBUG, "CWinSystemEGL::CreateNewWindow: No need to create a new window");
-     return true;
-   }
- 
-+  m_stereo_mode = stereo_mode;
-   m_bFullScreen   = fullScreen;
-   // Destroy any existing window
-   if (m_surface != EGL_NO_SURFACE)
-diff --git a/xbmc/windowing/egl/WinSystemEGL.h b/xbmc/windowing/egl/WinSystemEGL.h
-index 1ec4225..a33dedc 100644
---- a/xbmc/windowing/egl/WinSystemEGL.h
-+++ b/xbmc/windowing/egl/WinSystemEGL.h
-@@ -78,6 +78,7 @@ protected:
-   EGLSurface            m_surface;
-   EGLContext            m_context;
-   EGLConfig             m_config;
-+  RENDER_STEREO_MODE    m_stereo_mode;
- 
-   CEGLWrapper           *m_egl;
-   std::string           m_extensions;
-
-From 5d836aad86bfed970e902005bae5761415cec58d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 19/93] [rbp/omxplayer] When opening a stream don't try to
- update gui so often
-
----
- xbmc/dialogs/GUIDialogBusy.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/dialogs/GUIDialogBusy.cpp b/xbmc/dialogs/GUIDialogBusy.cpp
-index 6816b45..6cc5a8f 100644
---- a/xbmc/dialogs/GUIDialogBusy.cpp
-+++ b/xbmc/dialogs/GUIDialogBusy.cpp
-@@ -69,7 +69,11 @@ bool CGUIDialogBusy::WaitOnEvent(CEvent &event, unsigned int displaytime /* = 10
-     {
-       dialog->Open();
- 
-+#ifdef TARGET_RASPBERRY_PI
-+      while(!event.WaitMSec(100))
-+#else
-       while(!event.WaitMSec(1))
-+#endif
-       {
-         g_windowManager.ProcessRenderLoop(false);
-         if (allowCancel && dialog->IsCanceled())
-
-From e01575ea1b07d19332017fca0e1a51389b78d93d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 29 Apr 2014 15:23:22 +0100
-Subject: [PATCH 20/93] [ffmpeg] Speed up wtv index creation
-
-The index creation is O(N^2) with number of entries (typically thousands).
-On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
-
-By replacing with an O(N) loop, this takes virtually zero time
----
- tools/depends/target/ffmpeg/Makefile               |  3 +-
- .../ffmpeg_Speed_up_wtv_index_creation.patch       | 47 ++++++++++++++++++++++
- 2 files changed, 49 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index ae932ce..fcfc553 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,6 +1,6 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
--DEPS= ../../Makefile.include FFMPEG-VERSION Makefile
-+DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -70,6 +70,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
-+	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch b/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-new file mode 100644
-index 0000000..d829898
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
-@@ -0,0 +1,47 @@
-+commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5
-+Author: popcornmix <popcornmix@gmail.com>
-+Date:   Tue Mar 25 19:43:07 2014 +0000
-+
-+    [ffmpeg] Speed up wtv index creation
-+
-+    The index creation is O(N^2) with number of entries (typically thousands).
-+    On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
-+
-+    By replacing with an O(N) loop, this takes virtually zero time
-+
-+diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c
-+index e423370..70898bd 100644
-+--- a/libavformat/wtvdec.c
-++++ b/libavformat/wtvdec.c
-+@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s)
-+                 pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16);
-+                 if (pb) {
-+                     int i;
-++                    AVIndexEntry *e = wtv->index_entries;
-++                    AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1;
-++                    uint64_t last_position = 0;
-+                     while (1) {
-+                         uint64_t frame_nb = avio_rl64(pb);
-+                         uint64_t position = avio_rl64(pb);
-++                        while (frame_nb > e->size && e <= e_end) {
-++                           e->pos = last_position;
-++                           e++;
-++                        }
-+                         if (avio_feof(pb))
-+                             break;
-+-                        for (i = wtv->nb_index_entries - 1; i >= 0; i--) {
-+-                            AVIndexEntry *e = wtv->index_entries + i;
-+-                            if (frame_nb > e->size)
-+-                                break;
-+-                            if (position > e->pos)
-+-                                e->pos = position;
-+-                        }
-++                        last_position = position;
-+                     }
-++                    e_end->pos = last_position;
-+                     wtvfile_close(pb);
-+-                    st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp;
-++                    st->duration = e_end->timestamp;
-+                 }
-+             }
-+         }
-
-From a29142db6e36056fd988b3199747c0da0dab78a0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 21/93] [hifiberry] Hack: force it to be recognised as IEC958
- capable to enable passthrough options
-
----
- xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-index e22db7a..0120bd5 100644
---- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-+++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-@@ -1342,6 +1342,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev
-     if (snd_card_get_name(cardNr, &cardName) == 0)
-       info.m_displayName = cardName;
- 
-+    // hack: hifiberry digi doesn't correctly report as iec958 device. Needs fixing in kernel driver
-+    if (info.m_displayName == "snd_rpi_hifiberry_digi")
-+      info.m_deviceType = AE_DEVTYPE_IEC958;
-+
-     if (info.m_deviceType == AE_DEVTYPE_HDMI && info.m_displayName.size() > 5 &&
-         info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
-     {
-
-From cad0f33be5e0b5989ece0863ba96158dbf5174d9 Mon Sep 17 00:00:00 2001
-From: Ben Avison <bavison@riscosopen.org>
-Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 22/93] Improved file buffering in CArchive
-
-Even though memcpy is typically inlined by the compiler into byte/word loads
-and stores (at least for release builds), the frequency with which 1, 2 and 4
-byte loads/stores are encountered in cases where the size is *not*
-determinable at compile time is still high enough that it's worth handling
-these specially. On the ARM1176JZF-S in the Raspberry Pi, this improves the
-total time to open a library (in the case where it's fetched from a CArchive)
-by around 4%.
-
-It should be noted that this code uses 16-bit and 32-bit word loads and
-stores that are not necessarily aligned to their respective widths. It is
-possible that there are some architectures out there which do not support
-this, although all ARMs since ARMv6 have supported it (and ARMs earlier than
-that are probably not powerful enough to be good targets for XBMC).
----
- xbmc/utils/Archive.h | 16 ++++++++++++++++
- 1 file changed, 16 insertions(+)
-
-diff --git a/xbmc/utils/Archive.h b/xbmc/utils/Archive.h
-index 6ed0f8f..8506d95 100644
---- a/xbmc/utils/Archive.h
-+++ b/xbmc/utils/Archive.h
-@@ -154,9 +154,17 @@ protected:
-      * than waiting until we attempt to put more data into an already full buffer */
-     if (m_BufferRemain > size)
-     {
-+      switch (size)
-+      {
-+      case 1: *m_BufferPos++ = *ptr; m_BufferRemain--; break;
-+      case 2: *(uint16_t *) m_BufferPos = *(const uint16_t *) ptr; m_BufferPos += 2; m_BufferRemain -= 2; break;
-+      case 4: *(uint32_t *) m_BufferPos = *(const uint32_t *) ptr; m_BufferPos += 4; m_BufferRemain -= 4; break;
-+      default:
-       memcpy(m_BufferPos, ptr, size);
-       m_BufferPos += size;
-       m_BufferRemain -= size;
-+      break;
-+      }
-       return *this;
-     }
-     else
-@@ -171,9 +179,17 @@ protected:
-     /* Note, refilling the buffer is deferred until we know we need to read more from it */
-     if (m_BufferRemain >= size)
-     {
-+      switch (size)
-+      {
-+      case 1: *ptr = *m_BufferPos++; m_BufferRemain--; break;
-+      case 2: *(uint16_t *) ptr = *(const uint16_t *) m_BufferPos; m_BufferPos += 2; m_BufferRemain -= 2; break;
-+      case 4: *(uint32_t *) ptr = *(const uint32_t *) m_BufferPos; m_BufferPos += 4; m_BufferRemain -= 4; break;
-+      default:
-       memcpy(ptr, m_BufferPos, size);
-       m_BufferPos += size;
-       m_BufferRemain -= size;
-+      break;
-+      }
-       return *this;
-     }
-     else
-
-From 17eebeec762e4f1c921d886b6863ac4a21cdb2f0 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 12 Aug 2014 00:31:36 +0100
-Subject: [PATCH 23/93] [omxcodec] Don't force software codec with dvds
-
----
- xbmc/cores/dvdplayer/DVDPlayer.cpp | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.cpp b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-index 4ffe33a..4b09e8f 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-@@ -3501,7 +3501,9 @@ bool CDVDPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
-       hint.aspect = aspect;
-       hint.forced_aspect = true;
-     }
-+#ifndef TARGET_RASPBERRY_PI
-     hint.software = true;
-+#endif
-   }
-   else if (m_pInputStream && m_pInputStream->IsStreamType(DVDSTREAM_TYPE_PVRMANAGER))
-   {
-
-From 9da36b4157459cc72529ef6be5721f1ff6920ef6 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 24/93] filesystem: Make support of browsing into archives
- optional
-
-The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
-It's quite common to see reports of a large rar file that causes xbmc to crash with an out-of-memory error when browsing or scanning.
-It also can be slow as any archive in the directory is opened and extracted.
-
-This causes issues for people who scan library with archives disabled, then subsequently enable it.
-The library has the .rar files in which don't play without removing and re-adding.
-
-We'll let people who don't use archives disable it manually
----
- addons/resource.language.en_gb/resources/strings.po | 9 +++++++++
- system/settings/settings.xml                        | 5 +++++
- xbmc/filesystem/FileDirectoryFactory.cpp            | 4 ++++
- 3 files changed, 18 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 01173ca..e908209 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18036,6 +18036,15 @@ msgstr ""
- #: system/settings/rbp.xml
- msgctxt "#38010"
- msgid "GPU accelerated"
-+
-+#: system/settings/settings.xml
-+msgctxt "#38020"
-+msgid "Support browsing into archives"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38021"
-+msgid "Allow viewing and playing files in archives (e.g. zip, rar)"
- msgstr ""
- 
- #. Setting #38011 "Videos -> Library -> Show All Items entry"
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 76c9a33..7ca534d 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -317,6 +317,11 @@
-           <default>false</default>
-           <control type="toggle" />
-         </setting>
-+        <setting id="filelists.browsearchives" type="boolean" label="38020" help="38021">
-+          <level>1</level>
-+          <default>true</default>
-+          <control type="toggle" />
-+        </setting>
-       </group>
-     </category>
-     <category id="screensaver" label="360" help="36128">
-diff --git a/xbmc/filesystem/FileDirectoryFactory.cpp b/xbmc/filesystem/FileDirectoryFactory.cpp
-index a1d4ee4..4929283 100644
---- a/xbmc/filesystem/FileDirectoryFactory.cpp
-+++ b/xbmc/filesystem/FileDirectoryFactory.cpp
-@@ -40,6 +40,7 @@
- #include "playlists/PlayListFactory.h"
- #include "Directory.h"
- #include "File.h"
-+#include "settings/Settings.h"
- #include "FileItem.h"
- #include "utils/StringUtils.h"
- #include "URL.h"
-@@ -112,6 +113,8 @@ IFileDirectory* CFileDirectoryFactory::Create(const CURL& url, CFileItem* pItem,
-     return NULL;
-   }
- #endif
-+  if (CSettings::GetInstance().GetBool("filelists.browsearchives"))
-+  {
-   if (url.IsFileType("zip"))
-   {
-     CURL zipURL = URIUtils::CreateArchivePath("zip", url);
-@@ -185,6 +188,7 @@ IFileDirectory* CFileDirectoryFactory::Create(const CURL& url, CFileItem* pItem,
-     }
-     return NULL;
-   }
-+  }
-   if (url.IsFileType("xbt"))
-   {
-     CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
-
-From b0231de02ec1821e136d75ff0f3986aaed8f0d92 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 25/93] [rbp] Make cachemembuffersize default depend on memory
- size
-
----
- xbmc/linux/RBP.cpp                 | 10 ++++++++++
- xbmc/linux/RBP.h                   |  1 +
- xbmc/settings/AdvancedSettings.cpp | 12 +++++++++++-
- 3 files changed, 22 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 2a9a93a..6c5288d 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -23,6 +23,7 @@
- 
- #include <assert.h>
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "utils/log.h"
- 
- #include "cores/omxplayer/OMXImage.h"
-@@ -43,6 +44,12 @@ CRBP::~CRBP()
-   delete m_DllBcmHost;
- }
- 
-+void CRBP::InitializeSettings()
-+{
-+  if (m_initialized && g_advancedSettings.m_cacheMemBufferSize == ~0U)
-+    g_advancedSettings.m_cacheMemBufferSize = m_arm_mem < 256 ? 1024 * 1024 * 2 : 1024 * 1024 * 20;
-+}
-+
- bool CRBP::Initialize()
- {
-   CSingleLock lock (m_critSection);
-@@ -82,6 +89,8 @@ bool CRBP::Initialize()
-   if (!m_gui_resolution_limit)
-     m_gui_resolution_limit = m_gpu_mem < 128 ? 720:1080;
- 
-+  InitializeSettings();
-+
-   g_OMXImage.Initialize();
-   m_omx_image_init = true;
-   return true;
-@@ -94,6 +103,7 @@ void CRBP::LogFirmwareVerison()
-   response[sizeof(response) - 1] = '\0';
-   CLog::Log(LOGNOTICE, "Raspberry PI firmware version: %s", response);
-   CLog::Log(LOGNOTICE, "ARM mem: %dMB GPU mem: %dMB MPG2:%d WVC1:%d", m_arm_mem, m_gpu_mem, m_codec_mpg2_enabled, m_codec_wvc1_enabled);
-+  CLog::Log(LOGNOTICE, "cacheMemBufferSize: %dMB",  g_advancedSettings.m_cacheMemBufferSize >> 20);
-   m_DllBcmHost->vc_gencmd(response, sizeof response, "get_config int");
-   response[sizeof(response) - 1] = '\0';
-   CLog::Log(LOGNOTICE, "Config:\n%s", response);
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 9559914..7fc8b42 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -48,6 +48,7 @@ public:
-   ~CRBP();
- 
-   bool Initialize();
-+  void InitializeSettings();
-   void LogFirmwareVerison();
-   void Deinitialize();
-   int GetArmMem() { return m_arm_mem; }
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index bc3aa8c..562757e 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -49,6 +49,9 @@
- #if defined(TARGET_DARWIN_IOS)
- #include "osx/DarwinUtils.h"
- #endif
-+#if defined(TARGET_RASPBERRY_PI)
-+#include "linux/RBP.h"
-+#endif
- 
- using namespace ADDON;
- using namespace XFILE;
-@@ -344,7 +347,12 @@ void CAdvancedSettings::Initialize()
-   m_bPVRAutoScanIconsUserSet       = false;
-   m_iPVRNumericChannelSwitchTimeout = 1000;
- 
-+#ifdef TARGET_RASPBERRY_PI
-+  // want default to be memory dependent, but interface to gpu not available yet, so set in RBP.cpp
-+  m_cacheMemBufferSize = ~0;
-+#else
-   m_cacheMemBufferSize = 1024 * 1024 * 20;
-+#endif
-   m_networkBufferMode = 0; // Default (buffer all internet streams/filesystems)
-   // the following setting determines the readRate of a player data
-   // as multiply of the default data read rate
-@@ -399,7 +407,9 @@ void CAdvancedSettings::Initialize()
-   #endif
- 
-   m_userAgent = g_sysinfo.GetUserAgent();
--
-+#ifdef TARGET_RASPBERRY_PI
-+  g_RBP.InitializeSettings();
-+#endif
-   m_initialized = true;
- }
- 
-
-From 6d080c7c800d2e1120b46c5490d64d80b4e63ad4 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 26/93] [settings] Experiment: Report DESKTOP resolution in
- video settings
-
----
- xbmc/settings/DisplaySettings.cpp | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/xbmc/settings/DisplaySettings.cpp b/xbmc/settings/DisplaySettings.cpp
-index 7993a73..761385b 100644
---- a/xbmc/settings/DisplaySettings.cpp
-+++ b/xbmc/settings/DisplaySettings.cpp
-@@ -683,6 +683,9 @@ void CDisplaySettings::SettingOptionsResolutionsFiller(const CSetting *setting,
-     std::vector<RESOLUTION_WHR> resolutions = g_Windowing.ScreenResolutions(info.iScreen, info.fRefreshRate);
-     for (std::vector<RESOLUTION_WHR>::const_iterator resolution = resolutions.begin(); resolution != resolutions.end(); ++resolution)
-     {
-+if (resolution->ResInfo_Index == RES_DESKTOP)
-+      list.push_back(std::make_pair(StringUtils::Format("DESKTOP"), resolution->ResInfo_Index));
-+else
-       list.push_back(std::make_pair(
-         StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
-                             ModeFlagsToString(resolution->flags, false).c_str()),
-
-From 80f582c6ced4a245d0cabb97a3e9fefc009e096d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 27/93] [dvdplayer/rbp] Add pi specific option to maintain
- vsync with pll adjustment
-
-New A/V sync option in settings/video/playback to do "Adjust PLL".
-This uses video clock (so perfect video syncing) but avoids having to resample
-or drop/dupe audio packets which is normally required.
-
-Needed updated firmware
-
-[dvdplayeraudio] Add advancedsetting for configuring max pll adjustment
-
-[dvdplayer] Allow pll adjustment to go higher, but tail off more gradually
----
- .../resource.language.en_gb/resources/strings.po   | 23 +++++++++++++-
- system/settings/settings.xml                       | 14 +++++++++
- xbmc/cores/AudioEngine/Utils/AEUtil.h              |  3 +-
- xbmc/cores/dvdplayer/DVDPlayerAudio.cpp            | 36 +++++++++++++++++++---
- xbmc/cores/dvdplayer/DVDPlayerAudio.h              |  3 ++
- xbmc/linux/RBP.cpp                                 | 13 ++++++++
- xbmc/linux/RBP.h                                   |  3 ++
- xbmc/settings/AdvancedSettings.cpp                 |  2 ++
- xbmc/settings/AdvancedSettings.h                   |  1 +
- 9 files changed, 91 insertions(+), 7 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index e908209..0f45ea0 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -6609,7 +6609,22 @@ msgctxt "#13459"
- msgid "Use OMXPlayer for decoding of video files."
- msgstr ""
- 
--#empty strings from id 13460 to 13504
-+#empty strings from id 13460 to 13499
-+
-+#: system/settings/settings.xml
-+msgctxt "#13500"
-+msgid "A/V sync method"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#13503"
-+msgid "Resample audio"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#13504"
-+msgid "Adjust PLL"
-+msgstr ""
- 
- #: system/settings/settings.xml
- msgctxt "#13505"
-@@ -18141,3 +18156,9 @@ msgstr ""
- msgctxt "#38030"
- msgid "This option uses frame-packing to output full resolution for 3D through HDMI.[CR]Enabling this improves quality of Multiview Video Coding (MVC) videos, but may not be supported by all displays."
- msgstr ""
-+
-+#. Description of setting "Videos -> Playback -> A/V sync method" with label #13500
-+#: system/settings/settings.xml
-+msgctxt "#38006"
-+msgid "Audio has to stay in sync, this can either be done by resampling, or adjusting the PLL"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 7ca534d..1b57136 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -595,6 +595,20 @@
-           <default>false</default>
-           <control type="toggle" />
-         </setting>
-+        <setting id="videoplayer.synctype" type="integer" parent="videoplayer.usedisplayasclock" label="13500" help="38006">
-+          <level>2</level>
-+          <default>2</default> <!-- SYNC_RESAMPLE -->
-+          <constraints>
-+            <options>
-+              <option label="13503">2</option> <!-- SYNC_RESAMPLE -->
-+              <option label="13504">3</option> <!-- SYNC_PLLADJUST -->
-+            </options>
-+          </constraints>
-+          <dependencies>
-+            <dependency type="enable" setting="videoplayer.usedisplayasclock" operator="is">true</dependency>
-+          </dependencies>
-+          <control type="spinner" format="string" />
-+        </setting>
-         <setting id="videoplayer.errorinaspect" type="integer" label="22021" help="36170">
-           <level>2</level>
-           <default>0</default>
-diff --git a/xbmc/cores/AudioEngine/Utils/AEUtil.h b/xbmc/cores/AudioEngine/Utils/AEUtil.h
-index 56c0a1f..f7f63b5 100644
---- a/xbmc/cores/AudioEngine/Utils/AEUtil.h
-+++ b/xbmc/cores/AudioEngine/Utils/AEUtil.h
-@@ -57,7 +57,8 @@ enum AVSync
- {
-   SYNC_DISCON   = 0,
-   SYNC_SKIPDUP,
--  SYNC_RESAMPLE
-+  SYNC_RESAMPLE,
-+  SYNC_PLLADJUST
- };
- 
- struct AEDelayStatus
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-index 39074ff..97a23a6 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-@@ -24,6 +24,7 @@
- #include "DVDCodecs/Audio/DVDAudioCodec.h"
- #include "DVDCodecs/DVDFactoryCodec.h"
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "video/VideoReferenceClock.h"
- #include "utils/log.h"
- #include "utils/MathUtils.h"
-@@ -109,6 +110,9 @@ CDVDPlayerAudio::CDVDPlayerAudio(CDVDClock* pClock, CDVDMessageQueue& parent)
-   m_started = false;
-   m_silence = false;
-   m_resampleratio = 1.0;
-+  m_plladjust = 1.0;
-+  m_last_plladjust = 1.0;
-+  m_last_error = 0.0;
-   m_synctype = SYNC_DISCON;
-   m_setsynctype = SYNC_DISCON;
-   m_prevsynctype = -1;
-@@ -182,11 +186,13 @@ void CDVDPlayerAudio::OpenStream( CDVDStreamInfo &hints, CDVDAudioCodec* codec )
-   m_synctype = SYNC_DISCON;
-   m_setsynctype = SYNC_DISCON;
-   if (CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEDISPLAYASCLOCK))
--    m_setsynctype = SYNC_RESAMPLE;
-+    m_setsynctype = CSettings::GetInstance().GetInt("videoplayer.synctype");
-   m_prevsynctype = -1;
- 
-   m_error = 0;
-   m_errors.Flush();
-+  m_plladjust = 1.0;
-+  m_last_plladjust = 1.0;
-   m_integral = 0;
-   m_prevskipped = false;
-   m_syncclock = true;
-@@ -229,7 +235,6 @@ void CDVDPlayerAudio::CloseStream(bool bWaitForBuffers)
- 
-   // uninit queue
-   m_messageQueue.End();
--
-   CLog::Log(LOGNOTICE, "Deleting audio codec");
-   if (m_pAudioCodec)
-   {
-@@ -482,7 +487,11 @@ void CDVDPlayerAudio::UpdatePlayerInfo()
-   //print the inverse of the resample ratio, since that makes more sense
-   //if the resample ratio is 0.5, then we're playing twice as fast
-   if (m_synctype == SYNC_RESAMPLE)
--    s << ", rr:" << std::fixed << std::setprecision(5) << 1.0 / m_resampleratio;
-+    s << ", rr:" << std::fixed << std::setprecision(5) << 1.0 / m_resampleratio << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
-+  if (m_synctype == SYNC_SKIPDUP)
-+    s << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
-+  if (m_synctype == SYNC_PLLADJUST)
-+    s << ", pll:" << std::fixed << std::setprecision(5) << g_RBP.GetAdjustHDMIClock() << ", err:" << std::fixed << std::setprecision(1) << m_last_error * 1e-3 << "ms";
- 
-   s << ", att:" << std::fixed << std::setprecision(1) << log(GetCurrentAttenuation()) * 20.0f << " dB";
- 
-@@ -637,8 +646,8 @@ void CDVDPlayerAudio::SetSyncType(bool passthrough)
- 
-   if (m_synctype != m_prevsynctype)
-   {
--    const char *synctypes[] = {"clock feedback", "skip/duplicate", "resample", "invalid"};
--    int synctype = (m_synctype >= 0 && m_synctype <= 2) ? m_synctype : 3;
-+    const char *synctypes[] = {"clock feedback", "skip/duplicate", "resample", "pll adjust", "invalid"};
-+    int synctype = (m_synctype >= 0 && m_synctype <= 3) ? m_synctype : 4;
-     CLog::Log(LOGDEBUG, "CDVDPlayerAudio:: synctype set to %i: %s", m_synctype, synctypes[synctype]);
-     m_prevsynctype = m_synctype;
-   }
-@@ -748,7 +757,19 @@ void CDVDPlayerAudio::HandleSyncError(double duration)
-       proportional = m_error / DVD_TIME_BASE / proportionaldiv;
-     }
-     m_resampleratio = 1.0 / m_pClock->GetClockSpeed() + proportional + m_integral;
-+    CLog::Log(LOGDEBUG, "CDVDPlayerAudio::%s rr:%.5f error:%.3fms", __FUNCTION__, m_resampleratio, m_error * 1e-3);
-+  }
-+  else if (m_synctype == SYNC_PLLADJUST)
-+  {
-+#if defined(TARGET_RASPBERRY_PI)
-+    double e = std::max(std::min(m_error / DVD_MSEC_TO_TIME(50), 1.0), -1.0);
-+    double adjust = g_advancedSettings.m_maxPllAdjust * 1e-6;
-+    m_plladjust = 1.0 + e * adjust;
-+    m_last_plladjust = g_RBP.AdjustHDMIClock(m_plladjust);
-+    CLog::Log(LOGDEBUG, "CDVDPlayerAudio::%s pll:%.5f (%.5f) error:%.6f e:%.6f a:%f", __FUNCTION__, m_plladjust, m_last_plladjust, m_error, e * adjust, adjust );
-+#endif
-   }
-+  m_last_error = m_error;
- }
- 
- bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-@@ -801,6 +822,7 @@ bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-     {
-       m_dvdAudio.AddPackets(audioframe);
-     }
-+    m_plladjust = 1.0;
-   }
-   else if (m_synctype == SYNC_DISCON)
-   {
-@@ -835,6 +857,10 @@ bool CDVDPlayerAudio::OutputPacket(DVDAudioFrame &audioframe)
-     m_dvdAudio.SetResampleRatio(m_resampleratio);
-     m_dvdAudio.AddPackets(audioframe);
-   }
-+  else if (m_synctype == SYNC_PLLADJUST)
-+  {
-+    m_dvdAudio.AddPackets(audioframe);
-+  }
- 
-   return true;
- }
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.h b/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-index 014574d..409b2d7 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.h
-@@ -228,6 +228,9 @@ protected:
-   bool   m_prevskipped;
-   double m_maxspeedadjust;
-   double m_resampleratio; //resample ratio when using SYNC_RESAMPLE, used for the codec info
-+  double m_plladjust;    // for display using SYNC_PLLADJUST
-+  double m_last_error;    // for display using SYNC_PLLADJUST
-+  double m_last_plladjust;    // for display using SYNC_PLLADJUST
- 
-   struct SInfo
-   {
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 6c5288d..a79d6d9 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -35,6 +35,7 @@ CRBP::CRBP()
-   m_DllBcmHost      = new DllBcmHost();
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-+  m_last_pll_adjust = 1.0;
- }
- 
- CRBP::~CRBP()
-@@ -225,4 +226,16 @@ void CRBP::Deinitialize()
-   m_initialized     = false;
-   m_omx_initialized = false;
- }
-+
-+double CRBP::AdjustHDMIClock(double adjust)
-+{
-+  char response[80];
-+  vc_gencmd(response, sizeof response, "hdmi_adjust_clock %f", adjust);
-+  char *p = strchr(response, '=');
-+  if (p)
-+    m_last_pll_adjust = atof(p+1);
-+  CLog::Log(LOGDEBUG, "CRBP::%s(%.4f) = %.4f", __func__, adjust, m_last_pll_adjust);
-+  return m_last_pll_adjust;
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 7fc8b42..db2fade 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -64,6 +64,8 @@ public:
-   unsigned char *CaptureDisplay(int width, int height, int *stride, bool swap_red_blue, bool video_only = true);
-   DllOMX *GetDllOMX() { return m_OMX ? m_OMX->GetDll() : NULL; }
-   void WaitVsync();
-+  double AdjustHDMIClock(double adjust);
-+  double GetAdjustHDMIClock() { return m_last_pll_adjust; }
- 
- private:
-   DllBcmHost *m_DllBcmHost;
-@@ -80,6 +82,7 @@ private:
-   CEvent     m_vsync;
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-+  double m_last_pll_adjust;
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 562757e..22b8459 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -116,6 +116,7 @@ void CAdvancedSettings::Initialize()
-   m_audioHeadRoom = 0;
-   m_ac3Gain = 12.0f;
-   m_audioApplyDrc = -1.0f;
-+  m_maxPllAdjust = 1000;
-   m_dvdplayerIgnoreDTSinWAV = false;
- 
-   //default hold time of 25 ms, this allows a 20 hertz sine to pass undistorted
-@@ -467,6 +468,7 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-   if (pElement)
-   {
-     XMLUtils::GetFloat(pElement, "ac3downmixgain", m_ac3Gain, -96.0f, 96.0f);
-+    XMLUtils::GetInt(pElement, "maxplladjust", m_maxPllAdjust, 0, 1000000);
-     XMLUtils::GetInt(pElement, "headroom", m_audioHeadRoom, 0, 12);
-     XMLUtils::GetString(pElement, "defaultplayer", m_audioDefaultPlayer);
-     // 101 on purpose - can be used to never automark as watched
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 6475350..93de9bd 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -143,6 +143,7 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
- 
-     int m_audioHeadRoom;
-     float m_ac3Gain;
-+    int m_maxPllAdjust;
-     std::string m_audioDefaultPlayer;
-     float m_audioPlayCountMinimumPercent;
-     bool m_dvdplayerIgnoreDTSinWAV;
-
-From cecfb10575958e190cf3c6394ff2158bff6fe52a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 28/93] [audio] Add settings option to boost centre channel
- when downmixing
-
-This allows a dB volume increase to be added to centre channel.
-This can help improve dialgue in the presence of background music/effects.
-It can go up to 30dB for testing purposes, but value of 6 is probably more reasonable.
-It is recommended to ensure "Normalise levels on downmix" is enabled when boosting by large values to avoid clipping.
-
-Should work with Pi Sink (dvdplayer/paplayer) and omxplayer
----
- addons/resource.language.en_gb/resources/strings.po       | 15 +++++++++++++++
- system/settings/settings.xml                              | 12 ++++++++++++
- .../Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp           |  7 +++++++
- .../AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp   |  6 ++++++
- xbmc/cores/omxplayer/OMXAudio.cpp                         |  6 ++++++
- 5 files changed, 46 insertions(+)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 0f45ea0..cc486da 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18162,3 +18162,18 @@ msgstr ""
- msgctxt "#38006"
- msgid "Audio has to stay in sync, this can either be done by resampling, or adjusting the PLL"
- msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38007"
-+msgid "Boost centre channel when downmixing"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38008"
-+msgid "Increase this value to make the dialogue louder compared to background sounds when downmixing multichannel audio"
-+msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38009"
-+msgid "%i dB"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 1b57136..918e8bf 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -2558,6 +2558,18 @@
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
-+         <setting id="audiooutput.boostcenter" type="integer" label="38007" help="38008">
-+          <level>2</level>
-+          <default>0</default>
-+          <constraints>
-+            <minimum>0</minimum>
-+            <step>1</step>
-+            <maximum>30</maximum>
-+          </constraints>
-+          <control type="spinner" format="string">
-+            <formatlabel>38009</formatlabel>
-+          </control>
-+        </setting>
-         <setting id="audiooutput.processquality" type="integer" label="13505" help="36169">
-           <requirement>HAS_AE_QUALITY_LEVELS</requirement>
-           <level>2</level>
-diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-index e4ddf9e..625ea88 100644
---- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-+++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResampleFFMPEG.cpp
-@@ -20,6 +20,7 @@
- 
- #include "cores/AudioEngine/Utils/AEUtil.h"
- #include "ActiveAEResampleFFMPEG.h"
-+#include "settings/Settings.h"
- #include "utils/log.h"
- 
- extern "C" {
-@@ -105,6 +106,12 @@ bool CActiveAEResampleFFMPEG::Init(uint64_t dst_chan_layout, int dst_channels, i
-   {
-      av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-   }
-+  int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+  if (boost_center)
-+  {
-+    float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+    av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+  }
- 
-   if (remapLayout)
-   {
-diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-index 7807149..698a6ae 100644
---- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-+++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEResamplePi.cpp
-@@ -164,6 +164,12 @@ bool CActiveAEResamplePi::Init(uint64_t dst_chan_layout, int dst_channels, int d
-   {
-     av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-   }
-+  int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+  if (boost_center)
-+  {
-+    float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+    av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+  }
- 
-   if (remapLayout)
-   {
-diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
-index 08b1b84..70d0866 100644
---- a/xbmc/cores/omxplayer/OMXAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudio.cpp
-@@ -641,6 +641,12 @@ bool COMXAudio::Initialize(AEAudioFormat format, OMXClock *clock, CDVDStreamInfo
-     {
-        av_opt_set_double(m_pContext, "rematrix_maxval", 1.0, 0);
-     }
-+    int boost_center = CSettings::GetInstance().GetInt("audiooutput.boostcenter");
-+    if (boost_center)
-+    {
-+      float gain = pow(10.0f, ((float)(-3 + boost_center))/20.0f);
-+      av_opt_set_double(m_pContext, "center_mix_level", gain, 0);
-+    }
- 
-     // stereo upmix
-     if (upmix && m_src_channels == 2 && m_dst_channels > 2)
-
-From cd089d7903e1fd4e0812ad817126a19d07fa896d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 29/93] [rbp] Default extract thumbnails to false
-
-It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
-It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
----
- system/settings/rbp.xml | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 7a170c2..1506035 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -42,6 +42,16 @@
-         </setting>
-       </group>
-     </category>
-+    <category id="myvideos">
-+      <group id="1">
-+        <setting id="myvideos.extractflags">
-+          <default>false</default>
-+        </setting>
-+        <setting id="myvideos.extractthumb">
-+          <default>false</default>
-+        </setting>
-+      </group>
-+    </category>
-   </section>
- 
-   <section id="system">
-
-From c0b8590f78235540d82d478334c7f30fae417754 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 10 Feb 2015 15:29:16 +0000
-Subject: [PATCH 30/93] [libcec] Add repeating keypress patch from popcornmix'
- repo
-
----
- tools/depends/target/libcec/Makefile         |   1 +
- tools/depends/target/libcec/popcornmix.patch | 859 +++++++++++++++++++++++++++
- 2 files changed, 860 insertions(+)
- create mode 100644 tools/depends/target/libcec/popcornmix.patch
-
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index f54af9e..ddf9963 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -21,6 +21,7 @@ $(TARBALLS_LOCATION)/$(ARCHIVE):
- $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)/build
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
-+	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-diff --git a/tools/depends/target/libcec/popcornmix.patch b/tools/depends/target/libcec/popcornmix.patch
-new file mode 100644
-index 0000000..8366a69
---- /dev/null
-+++ b/tools/depends/target/libcec/popcornmix.patch
-@@ -0,0 +1,859 @@
-+From ec982e9800ae312972d306b67779215a2add6cde Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 24 Oct 2014 13:45:21 +0100
-+Subject: [PATCH 1/6] Make released key polling wait for exact time until key
-+ gets released
-+
-+---
-+ src/libcec/CECClient.cpp    | 16 ++++++++++++++--
-+ src/libcec/CECClient.h      |  2 +-
-+ src/libcec/CECProcessor.cpp |  8 +++++---
-+ src/libcec/LibCEC.cpp       | 10 ++++++++--
-+ src/libcec/LibCEC.h         |  4 +++-
-+ 5 files changed, 31 insertions(+), 9 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 35c2d3e..e307c0e 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -1067,7 +1067,7 @@ void CCECClient::SetCurrentButton(const cec_user_control_code iButtonCode)
-+   AddKey(key);
-+ }
-+ 
-+-void CCECClient::CheckKeypressTimeout(void)
-++uint16_t CCECClient::CheckKeypressTimeout(void)
-+ {
-+   cec_keypress key;
-+ 
-+@@ -1091,12 +1091,24 @@ void CCECClient::CheckKeypressTimeout(void)
-+     }
-+     else
-+     {
-+-      return;
-++      // time when this keypress will be released and we'd like to be called again
-++      unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-++        timeout = iTimeoutMs - (iNow - m_buttontime) + 1;
-++      else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-++        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_buttontime) + 1;
-++      if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_buttontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++      }
-++      return timeout;
-+     }
-+   }
-+ 
-+   LIB_CEC->AddLog(CEC_LOG_DEBUG, "key auto-released: %s (%1x)", ToString(key.keycode), key.keycode);
-+   QueueAddKey(key);
-++  return CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+ }
-+ 
-+ bool CCECClient::EnableCallbacks(void *cbParam, ICECCallbacks *callbacks)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index 12f8a3b..c9ce5e3 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -272,7 +272,7 @@ namespace CEC
-+     virtual void                  AddKey(bool bSendComboKey = false);
-+     virtual void                  AddKey(const cec_keypress &key);
-+     virtual void                  SetCurrentButton(const cec_user_control_code iButtonCode);
-+-    virtual void                  CheckKeypressTimeout(void);
-++    virtual uint16_t              CheckKeypressTimeout(void);
-+     virtual void                  SourceActivated(const cec_logical_address logicalAddress);
-+     virtual void                  SourceDeactivated(const cec_logical_address logicalAddress);
-+ 
-+diff --git a/src/libcec/CECProcessor.cpp b/src/libcec/CECProcessor.cpp
-+index 99f71aa..604b950 100644
-+--- a/src/libcec/CECProcessor.cpp
-++++ b/src/libcec/CECProcessor.cpp
-+@@ -52,7 +52,6 @@
-+ using namespace CEC;
-+ using namespace PLATFORM;
-+ 
-+-#define CEC_PROCESSOR_SIGNAL_WAIT_TIME 1000
-+ #define ACTIVE_SOURCE_CHECK_INTERVAL   500
-+ #define TV_PRESENT_CHECK_INTERVAL      30000
-+ 
-+@@ -260,6 +259,7 @@ bool CCECProcessor::OnCommandReceived(const cec_command &command)
-+ 
-+ void *CCECProcessor::Process(void)
-+ {
-++  uint16_t timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   m_libcec->AddLog(CEC_LOG_DEBUG, "processor thread started");
-+ 
-+   if (!m_connCheck)
-+@@ -274,13 +274,13 @@ void *CCECProcessor::Process(void)
-+   while (!IsStopped() && m_communication->IsOpen())
-+   {
-+     // wait for a new incoming command, and process it
-+-    if (m_inBuffer.Pop(command, CEC_PROCESSOR_SIGNAL_WAIT_TIME))
-++    if (m_inBuffer.Pop(command, timeout))
-+       ProcessCommand(command);
-+ 
-+     if (CECInitialised() && !IsStopped())
-+     {
-+       // check clients for keypress timeouts
-+-      m_libcec->CheckKeypressTimeout();
-++      timeout = m_libcec->CheckKeypressTimeout();
-+ 
-+       // check if we need to replace handlers
-+       ReplaceHandlers();
-+@@ -311,6 +311,8 @@ void *CCECProcessor::Process(void)
-+         tvPresentCheck.Init(TV_PRESENT_CHECK_INTERVAL);
-+       }
-+     }
-++    else
-++      timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   }
-+ 
-+   return NULL;
-+diff --git a/src/libcec/LibCEC.cpp b/src/libcec/LibCEC.cpp
-+index af36b79..5ccb8dd 100644
-+--- a/src/libcec/LibCEC.cpp
-++++ b/src/libcec/LibCEC.cpp
-+@@ -361,11 +361,17 @@ bool CLibCEC::IsValidPhysicalAddress(uint16_t iPhysicalAddress)
-+          iPhysicalAddress <= CEC_MAX_PHYSICAL_ADDRESS;
-+ }
-+ 
-+-void CLibCEC::CheckKeypressTimeout(void)
-++uint16_t CLibCEC::CheckKeypressTimeout(void)
-+ {
-++  uint16_t timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   // check all clients
-+   for (std::vector<CECClientPtr>::iterator it = m_clients.begin(); it != m_clients.end(); it++)
-+-    (*it)->CheckKeypressTimeout();
-++  {
-++    uint16_t t = (*it)->CheckKeypressTimeout();
-++    if (t < timeout)
-++      timeout = t;
-++  }
-++  return timeout;
-+ }
-+ 
-+ void CLibCEC::AddLog(const cec_log_level level, const char *strFormat, ...)
-+diff --git a/src/libcec/LibCEC.h b/src/libcec/LibCEC.h
-+index 6d9a229..d9d1e7b 100644
-+--- a/src/libcec/LibCEC.h
-++++ b/src/libcec/LibCEC.h
-+@@ -39,6 +39,8 @@
-+ #include "CECTypeUtils.h"
-+ #include <memory>
-+ 
-++#define CEC_PROCESSOR_SIGNAL_WAIT_TIME 1000
-++
-+ namespace CEC
-+ {
-+   class CAdapterCommunication;
-+@@ -125,7 +127,7 @@ namespace CEC
-+ 
-+       void AddLog(const cec_log_level level, const char *strFormat, ...);
-+       void AddCommand(const cec_command &command);
-+-      void CheckKeypressTimeout(void);
-++      uint16_t CheckKeypressTimeout(void);
-+       void Alert(const libcec_alert type, const libcec_parameter &param);
-+ 
-+       static bool IsValidPhysicalAddress(uint16_t iPhysicalAddress);
-+-- 
-+1.9.1
-+
-+
-+From 41f0f3ec9ac136da3565c96fd5a7075499f3938d Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 24 Oct 2014 13:51:34 +0100
-+Subject: [PATCH 2/6] Keep track of time since initial button press and last
-+ button update
-+
-+---
-+ src/libcec/CECClient.cpp | 44 +++++++++++++++++++++++++++-----------------
-+ src/libcec/CECClient.h   |  3 ++-
-+ 2 files changed, 29 insertions(+), 18 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index e307c0e..e7935b9 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -54,7 +54,8 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_bInitialised(false),
-+     m_bRegistered(false),
-+     m_iCurrentButton(CEC_USER_CONTROL_CODE_UNKNOWN),
-+-    m_buttontime(0),
-++    m_initialButtontime(0),
-++    m_updateButtontime(0),
-+     m_iPreventForwardingPowerOffCommand(0),
-+     m_iLastKeypressTime(0)
-+ {
-+@@ -981,9 +982,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+     {
-+-      key.duration = (unsigned int) (GetTimeMs() - m_buttontime);
-++      unsigned int duration = (unsigned int) (GetTimeMs() - m_updateButtontime);
-++      key.duration = (unsigned int) (GetTimeMs() - m_initialButtontime);
-+ 
-+-      if (key.duration > m_configuration.iComboKeyTimeoutMs ||
-++      if (duration > m_configuration.iComboKeyTimeoutMs ||
-+           m_configuration.iComboKeyTimeoutMs == 0 ||
-+           m_iCurrentButton != m_configuration.comboKey ||
-+           bSendComboKey)
-+@@ -991,14 +993,15 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+         key.keycode = m_iCurrentButton;
-+ 
-+         m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-        m_buttontime = 0;
-++        m_initialButtontime = 0;
-++        m_updateButtontime = 0;
-+       }
-+     }
-+   }
-+ 
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x)", ToString(key.keycode), key.keycode);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+     QueueAddKey(key);
-+   }
-+ }
-+@@ -1012,7 +1015,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+     AddKey();
-+     return;
-+   }
-+-
-++  bool isrepeat = false;
-+   cec_keypress transmitKey(key);
-+   cec_user_control_code comboKey(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+       m_configuration.comboKey : CEC_USER_CONTROL_CODE_STOP);
-+@@ -1035,22 +1038,27 @@ void CCECClient::AddKey(const cec_keypress &key)
-+         AddKey(true);
-+     }
-+ 
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x) current(%lx) duration(%d)", ToString(transmitKey.keycode), transmitKey.keycode, m_iCurrentButton, key.duration);
-++
-+     if (m_iCurrentButton == key.keycode)
-+     {
-+-      m_buttontime = GetTimeMs();
-++      m_updateButtontime = GetTimeMs();
-++      isrepeat = true;
-+     }
-+     else
-+     {
-+-      AddKey();
-++      if (m_iCurrentButton != transmitKey.keycode)
-++        AddKey();
-+       if (key.duration == 0)
-+       {
-+         m_iCurrentButton = transmitKey.keycode;
-+-        m_buttontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-++        m_initialButtontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-++        m_updateButtontime = m_initialButtontime;
-+       }
-+     }
-+   }
-+ 
-+-  if (key.keycode != comboKey || key.duration > 0)
-++  if (!isrepeat && (key.keycode != comboKey || key.duration > 0))
-+   {
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x)", ToString(transmitKey.keycode), transmitKey.keycode);
-+     QueueAddKey(transmitKey);
-+@@ -1074,32 +1082,34 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   {
-+     CLockObject lock(m_mutex);
-+     uint64_t iNow = GetTimeMs();
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "%s T:%.3f", __FUNCTION__, iNow*1e-3);
-+     cec_user_control_code comboKey(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.comboKey : CEC_USER_CONTROL_CODE_STOP);
-+     uint32_t iTimeoutMs(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_buttontime > iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && iNow - m_buttontime > CEC_BUTTON_TIMEOUT)))
-++          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime > iTimeoutMs) ||
-++          (m_iCurrentButton != comboKey && iNow - m_updateButtontime > CEC_BUTTON_TIMEOUT)))
-+     {
-+-      key.duration = (unsigned int) (iNow - m_buttontime);
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+ 
-+       m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-      m_buttontime = 0;
-++      m_initialButtontime = 0;
-++      m_updateButtontime = 0;
-+     }
-+     else
-+     {
-+       // time when this keypress will be released and we'd like to be called again
-+       unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+-        timeout = iTimeoutMs - (iNow - m_buttontime) + 1;
-++        timeout = iTimeoutMs - (iNow - m_updateButtontime) + 1;
-+       else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-+-        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_buttontime) + 1;
-++        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_updateButtontime) + 1;
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_buttontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+       return timeout;
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index c9ce5e3..611c68b 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -404,7 +404,8 @@ namespace CEC
-+     PLATFORM::CMutex      m_mutex;                             /**< mutex for changes to this instance */
-+     PLATFORM::CMutex      m_cbMutex;                           /**< mutex that is held when doing anything with callbacks */
-+     cec_user_control_code m_iCurrentButton;                    /**< the control code of the button that's currently held down (if any) */
-+-    int64_t               m_buttontime;                        /**< the timestamp when the button was pressed (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_initialButtontime;                 /**< the timestamp when the button was initially pressed (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_updateButtontime;                  /**< the timestamp when the button was updated (in seconds since epoch), or 0 if none was pressed. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+     int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+     cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+-- 
-+1.9.1
-+
-+
-+From 273ead6980b69eddf98810eb1eb33d94a7d74fce Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Tue, 28 Oct 2014 00:09:18 +0000
-+Subject: [PATCH 3/6] Support repeating button presses with configurable repeat
-+ rate
-+
-+---
-+ include/cectypes.h                               |   6 ++
-+ src/libcec/CECClient.cpp                         | 100 +++++++++++++++++++----
-+ src/libcec/CECClient.h                           |   6 +-
-+ src/libcec/implementations/CECCommandHandler.cpp |   2 +-
-+ 4 files changed, 96 insertions(+), 18 deletions(-)
-+
-+diff --git a/include/cectypes.h b/include/cectypes.h
-+index acff259..8f098ef 100644
-+--- a/include/cectypes.h
-++++ b/include/cectypes.h
-+@@ -1493,6 +1493,8 @@ struct libcec_configuration
-+                                                    XXX changed meaning in 2.2.0 to not break binary compatibility. next major (3.0) release will fix it in a nicer way */
-+   cec_user_control_code comboKey;             /*!< key code that initiates combo keys. defaults to CEC_USER_CONTROL_CODE_F1_BLUE. CEC_USER_CONTROL_CODE_UNKNOWN to disable. added in 2.0.5 */
-+   uint32_t              iComboKeyTimeoutMs;   /*!< timeout until the combo key is sent as normal keypress */
-++  uint32_t              iButtonRepeatRateMs;  /*!< rate at which buttons autorepeat. 0 means rely on CEC device */
-++  uint32_t              iButtonReleaseDelayMs;/*!< duration after last update until a button is considered released */
-+ 
-+ #ifdef __cplusplus
-+    libcec_configuration(void) { Clear(); }
-+@@ -1527,6 +1529,8 @@ struct libcec_configuration
-+                  cecVersion                == other.cecVersion &&
-+                  adapterType               == other.adapterType &&
-+                  iDoubleTapTimeout50Ms     == other.iDoubleTapTimeout50Ms &&
-++                 iButtonRepeatRateMs       == other.iButtonRepeatRateMs &&
-++                 iButtonReleaseDelayMs     == other.iButtonReleaseDelayMs &&
-+                  (other.clientVersion <= LIBCEC_VERSION_TO_UINT(2, 0, 4) || comboKey            == other.comboKey) &&
-+                  (other.clientVersion <= LIBCEC_VERSION_TO_UINT(2, 0, 4) || iComboKeyTimeoutMs  == other.iComboKeyTimeoutMs) &&
-+                  (other.clientVersion <  LIBCEC_VERSION_TO_UINT(2, 1, 0) || bPowerOnScreensaver == other.bPowerOnScreensaver));
-+@@ -1567,6 +1571,8 @@ struct libcec_configuration
-+     iDoubleTapTimeout50Ms =           CEC_DOUBLE_TAP_TIMEOUT_50_MS;
-+     comboKey =                        CEC_USER_CONTROL_CODE_STOP;
-+     iComboKeyTimeoutMs =              CEC_DEFAULT_COMBO_TIMEOUT_MS;
-++    iButtonRepeatRateMs =             0;
-++    iButtonReleaseDelayMs =           CEC_BUTTON_TIMEOUT;
-+ 
-+     memset(strDeviceName, 0, 13);
-+     deviceTypes.Clear();
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index e7935b9..598628d 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -56,6 +56,10 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_iCurrentButton(CEC_USER_CONTROL_CODE_UNKNOWN),
-+     m_initialButtontime(0),
-+     m_updateButtontime(0),
-++    m_repeatButtontime(0),
-++    m_releaseButtontime(0),
-++    m_pressedButtoncount(0),
-++    m_releasedButtoncount(0),
-+     m_iPreventForwardingPowerOffCommand(0),
-+     m_iLastKeypressTime(0)
-+ {
-+@@ -851,6 +855,9 @@ bool CCECClient::GetCurrentConfiguration(libcec_configuration &configuration)
-+   configuration.bMonitorOnly              = m_configuration.bMonitorOnly;
-+   configuration.cecVersion                = m_configuration.cecVersion;
-+   configuration.adapterType               = m_configuration.adapterType;
-++  configuration.iDoubleTapTimeout50Ms     = m_configuration.iDoubleTapTimeout50Ms;
-++  configuration.iButtonRepeatRateMs       = m_configuration.iButtonRepeatRateMs;
-++  configuration.iButtonReleaseDelayMs     = m_configuration.iButtonReleaseDelayMs;
-+ 
-+   return true;
-+ }
-+@@ -894,6 +901,9 @@ bool CCECClient::SetConfiguration(const libcec_configuration &configuration)
-+     m_configuration.cecVersion                 = configuration.cecVersion;
-+     m_configuration.adapterType                = configuration.adapterType;
-+     m_configuration.iDoubleTapTimeout50Ms      = configuration.iDoubleTapTimeout50Ms;
-++    m_configuration.iButtonRepeatRateMs        = configuration.iButtonRepeatRateMs;
-++    m_configuration.iButtonReleaseDelayMs      = configuration.iButtonReleaseDelayMs;
-++
-+     m_configuration.deviceTypes.Add(configuration.deviceTypes[0]);
-+ 
-+     if (m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5))
-+@@ -950,6 +960,7 @@ bool CCECClient::SetConfiguration(const libcec_configuration &configuration)
-+     primary->ActivateSource();
-+   }
-+ 
-++  LIB_CEC->AddLog(CEC_LOG_DEBUG, "%s: %d:%d:%d", __FUNCTION__, DoubleTapTimeoutMS(), m_configuration.iButtonRepeatRateMs, m_configuration.iButtonReleaseDelayMs);
-+   return true;
-+ }
-+ 
-+@@ -973,11 +984,15 @@ void CCECClient::AddCommand(const cec_command &command)
-+   }
-+ }
-+ 
-+-void CCECClient::AddKey(bool bSendComboKey /* = false */)
-++void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /* = false */)
-+ {
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-++  // we ignore button releases when supporting repeating keys
-++  if (bButtonRelease && m_configuration.iButtonRepeatRateMs && m_configuration.iButtonReleaseDelayMs)
-++    return;
-++
-+   {
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+@@ -995,6 +1010,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */)
-+         m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+         m_initialButtontime = 0;
-+         m_updateButtontime = 0;
-++        m_repeatButtontime = 0;
-++        m_releaseButtontime = 0;
-++        m_pressedButtoncount = 0;
-++        m_releasedButtoncount = 0;
-+       }
-+     }
-+   }
-+@@ -1012,6 +1031,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+       key.keycode < CEC_USER_CONTROL_CODE_SELECT)
-+   {
-+     // send back the previous key if there is one
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "Unexpected key %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+     AddKey();
-+     return;
-+   }
-+@@ -1035,7 +1055,10 @@ void CCECClient::AddKey(const cec_keypress &key)
-+         transmitKey.keycode = CEC_USER_CONTROL_CODE_DOT;
-+       // default, send back the previous key
-+       else
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_DEBUG, "Combo key %s (%1x) D%dms:", ToString(key.keycode), key.keycode, key.duration);
-+         AddKey(true);
-++      }
-+     }
-+ 
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x) current(%lx) duration(%d)", ToString(transmitKey.keycode), transmitKey.keycode, m_iCurrentButton, key.duration);
-+@@ -1043,17 +1066,44 @@ void CCECClient::AddKey(const cec_keypress &key)
-+     if (m_iCurrentButton == key.keycode)
-+     {
-+       m_updateButtontime = GetTimeMs();
-+-      isrepeat = true;
-++      m_releaseButtontime = m_updateButtontime + (m_configuration.iButtonReleaseDelayMs ? m_configuration.iButtonReleaseDelayMs : CEC_BUTTON_TIMEOUT);
-++      // want to have seen some updated before considering a repeat
-++      if (m_configuration.iButtonRepeatRateMs)
-++      {
-++        if (!m_repeatButtontime && m_pressedButtoncount > 1)
-++          m_repeatButtontime = m_initialButtontime + DoubleTapTimeoutMS();
-++        isrepeat = true;
-++      }
-++      m_pressedButtoncount++;
-+     }
-+     else
-+     {
-+       if (m_iCurrentButton != transmitKey.keycode)
-++      {
-++        LIB_CEC->AddLog(CEC_LOG_DEBUG, "Changed key %s (%1x) D:%dms cur:%lx", ToString(transmitKey.keycode), transmitKey.keycode, transmitKey.duration, m_iCurrentButton);
-+         AddKey();
-++      }
-+       if (key.duration == 0)
-+       {
-+         m_iCurrentButton = transmitKey.keycode;
-+-        m_initialButtontime = m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN || key.duration > 0 ? 0 : GetTimeMs();
-+-        m_updateButtontime = m_initialButtontime;
-++        if (m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN)
-++        {
-++          m_initialButtontime = 0;
-++          m_updateButtontime = 0;
-++          m_repeatButtontime = 0;
-++          m_releaseButtontime = 0;
-++          m_pressedButtoncount = 0;
-++          m_releasedButtoncount = 0;
-++        }
-++        else
-++        {
-++          m_initialButtontime = GetTimeMs();
-++          m_updateButtontime = m_initialButtontime;
-++          m_repeatButtontime = 0; // set this on next update
-++          m_releaseButtontime = m_initialButtontime + (m_configuration.iButtonReleaseDelayMs ? m_configuration.iButtonReleaseDelayMs : CEC_BUTTON_TIMEOUT);
-++          m_pressedButtoncount = 1;
-++          m_releasedButtoncount = 0;
-++        }
-+       }
-+     }
-+   }
-+@@ -1072,12 +1122,16 @@ void CCECClient::SetCurrentButton(const cec_user_control_code iButtonCode)
-+   key.duration = 0;
-+   key.keycode = iButtonCode;
-+ 
-++  LIB_CEC->AddLog(CEC_LOG_DEBUG, "SetCurrentButton %s (%1x) D:%dms cur:%lx", ToString(key.keycode), key.keycode, key.duration);
-+   AddKey(key);
-+ }
-+ 
-+ uint16_t CCECClient::CheckKeypressTimeout(void)
-+ {
-++  // time when we'd like to be called again
-++  unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   cec_keypress key;
-++  key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-+   {
-+     CLockObject lock(m_mutex);
-+@@ -1089,8 +1143,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime > iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && iNow - m_updateButtontime > CEC_BUTTON_TIMEOUT)))
-++          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs) ||
-++          (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)))
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1098,27 +1152,41 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-+       m_initialButtontime = 0;
-+       m_updateButtontime = 0;
-++      m_repeatButtontime = 0;
-++      m_releaseButtontime = 0;
-++      m_pressedButtoncount = 0;
-++      m_releasedButtoncount = 0;
-++    }
-++    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-++          (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-++    {
-++      key.duration = 0;
-++      key.keycode = m_iCurrentButton;
-++      m_repeatButtontime = iNow + m_configuration.iButtonRepeatRateMs;
-++      timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+     }
-+     else
-+     {
-+-      // time when this keypress will be released and we'd like to be called again
-+-      unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+-        timeout = iTimeoutMs - (iNow - m_updateButtontime) + 1;
-+-      else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey)
-+-        timeout = CEC_BUTTON_TIMEOUT - (iNow - m_updateButtontime) + 1;
-++        timeout = std::min((uint64_t)timeout, m_updateButtontime - iNow + iTimeoutMs);
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_releaseButtontime)
-++        timeout = std::min((uint64_t)timeout, m_releaseButtontime - iNow);
-++      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_repeatButtontime)
-++        timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, CEC_BUTTON_TIMEOUT*1e-3, m_iCurrentButton);
-++        LIB_CEC->AddLog(CEC_LOG_ERROR, "Unexpected timeout: %d (%.3f %.3f %.3f) k:%02x", timeout, iNow*1e-3, m_updateButtontime*1e-3, m_releaseButtontime*1e-3, m_iCurrentButton);
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+-      return timeout;
-+     }
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key %s: %s (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : key.duration ? "released" : "repeated",
-++        ToString(m_iCurrentButton), m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-+   }
-+ 
-+-  LIB_CEC->AddLog(CEC_LOG_DEBUG, "key auto-released: %s (%1x)", ToString(key.keycode), key.keycode);
-+-  QueueAddKey(key);
-+-  return CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-++  if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-++    QueueAddKey(key);
-++
-++  return timeout;
-+ }
-+ 
-+ bool CCECClient::EnableCallbacks(void *cbParam, ICECCallbacks *callbacks)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index 611c68b..adeb5af 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -269,7 +269,7 @@ namespace CEC
-+     // callbacks
-+     virtual void                  Alert(const libcec_alert type, const libcec_parameter &param) { QueueAlert(type, param); }
-+     virtual void                  AddLog(const cec_log_message &message) { QueueAddLog(message); }
-+-    virtual void                  AddKey(bool bSendComboKey = false);
-++    virtual void                  AddKey(bool bSendComboKey = false, bool bButtonRelease = false);
-+     virtual void                  AddKey(const cec_keypress &key);
-+     virtual void                  SetCurrentButton(const cec_user_control_code iButtonCode);
-+     virtual uint16_t              CheckKeypressTimeout(void);
-+@@ -406,6 +406,10 @@ namespace CEC
-+     cec_user_control_code m_iCurrentButton;                    /**< the control code of the button that's currently held down (if any) */
-+     int64_t               m_initialButtontime;                 /**< the timestamp when the button was initially pressed (in seconds since epoch), or 0 if none was pressed. */
-+     int64_t               m_updateButtontime;                  /**< the timestamp when the button was updated (in seconds since epoch), or 0 if none was pressed. */
-++    int64_t               m_repeatButtontime;                  /**< the timestamp when the button will next repeat (in seconds since epoch), or 0 if repeat is disabled. */
-++    int64_t               m_releaseButtontime;                 /**< the timestamp when the button will be released (in seconds since epoch), or 0 if none was pressed. */
-++    int32_t               m_pressedButtoncount;                /**< the number of times a button released message has been seen for this press. */
-++    int32_t               m_releasedButtoncount;               /**< the number of times a button pressed message has been seen for this press. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+     int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+     cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+diff --git a/src/libcec/implementations/CECCommandHandler.cpp b/src/libcec/implementations/CECCommandHandler.cpp
-+index 6d6244e..d64186f 100644
-+--- a/src/libcec/implementations/CECCommandHandler.cpp
-++++ b/src/libcec/implementations/CECCommandHandler.cpp
-+@@ -770,7 +770,7 @@ int CCECCommandHandler::HandleUserControlRelease(const cec_command &command)
-+ 
-+   CECClientPtr client = m_processor->GetClient(command.destination);
-+   if (client)
-+-    client->AddKey();
-++    client->AddKey(false, true);
-+ 
-+   return COMMAND_HANDLED;
-+ }
-+-- 
-+1.9.1
-+
-+
-+From 3336d0827f7fd159430f3431642b07090c06c869 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Tue, 28 Oct 2014 01:21:35 +0000
-+Subject: [PATCH 4/6] Skip double press removal. It is handled through other
-+ means.
-+
-+---
-+ src/libcec/CECClient.cpp | 18 +-----------------
-+ src/libcec/CECClient.h   |  2 --
-+ 2 files changed, 1 insertion(+), 19 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 598628d..dccd874 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -60,11 +60,8 @@ CCECClient::CCECClient(CCECProcessor *processor, const libcec_configuration &con
-+     m_releaseButtontime(0),
-+     m_pressedButtoncount(0),
-+     m_releasedButtoncount(0),
-+-    m_iPreventForwardingPowerOffCommand(0),
-+-    m_iLastKeypressTime(0)
-++    m_iPreventForwardingPowerOffCommand(0)
-+ {
-+-  m_lastKeypress.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+-  m_lastKeypress.duration = 0;
-+   m_configuration.Clear();
-+   // set the initial configuration
-+   SetConfiguration(configuration);
-+@@ -1647,20 +1644,7 @@ void CCECClient::CallbackAddKey(const cec_keypress &key)
-+ {
-+   CLockObject lock(m_cbMutex);
-+   if (m_configuration.callbacks && m_configuration.callbacks->CBCecKeyPress)
-+-  {
-+-    // prevent double taps
-+-    int64_t now = GetTimeMs();
-+-    if (m_lastKeypress.keycode != key.keycode ||
-+-        key.duration > 0 ||
-+-        now - m_iLastKeypressTime >= DoubleTapTimeoutMS())
-+-    {
-+-      // no double tap
-+-      if (key.duration == 0)
-+-        m_iLastKeypressTime = now;
-+-      m_lastKeypress = key;
-+       m_configuration.callbacks->CBCecKeyPress(m_configuration.callbackParam, key);
-+-    }
-+-  }
-+ }
-+ 
-+ void CCECClient::CallbackAddLog(const cec_log_message &message)
-+diff --git a/src/libcec/CECClient.h b/src/libcec/CECClient.h
-+index adeb5af..43a713b 100644
-+--- a/src/libcec/CECClient.h
-++++ b/src/libcec/CECClient.h
-+@@ -411,8 +411,6 @@ namespace CEC
-+     int32_t               m_pressedButtoncount;                /**< the number of times a button released message has been seen for this press. */
-+     int32_t               m_releasedButtoncount;               /**< the number of times a button pressed message has been seen for this press. */
-+     int64_t               m_iPreventForwardingPowerOffCommand; /**< prevent forwarding standby commands until this time */
-+-    int64_t               m_iLastKeypressTime;                 /**< last time a key press was sent to the client */
-+-    cec_keypress          m_lastKeypress;                      /**< the last key press that was sent to the client */
-+     PLATFORM::SyncedBuffer<CCallbackWrap*> m_callbackCalls;
-+   };
-+ }
-+-- 
-+1.9.1
-+
-+
-+From 0dd0234f620a546bfa843172648383f83d88088c Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Mon, 3 Nov 2014 23:28:04 +0000
-+Subject: [PATCH 5/6] Pass through duration on all button repeats
-+
-+---
-+ src/libcec/CECClient.cpp | 34 ++++++++++++++++++++++++----------
-+ 1 file changed, 24 insertions(+), 10 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index dccd874..1946148 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -986,10 +986,6 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /*
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+ 
-+-  // we ignore button releases when supporting repeating keys
-+-  if (bButtonRelease && m_configuration.iButtonRepeatRateMs && m_configuration.iButtonReleaseDelayMs)
-+-    return;
-+-
-+   {
-+     CLockObject lock(m_mutex);
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN)
-+@@ -1015,6 +1011,10 @@ void CCECClient::AddKey(bool bSendComboKey /* = false */, bool bButtonRelease /*
-+     }
-+   }
-+ 
-++  // we don't forward releases when supporting repeating keys
-++  if (bButtonRelease && m_configuration.iButtonRepeatRateMs)
-++    return;
-++
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+   {
-+     LIB_CEC->AddLog(CEC_LOG_DEBUG, "key released: %s (%1x) D:%dms", ToString(key.keycode), key.keycode, key.duration);
-+@@ -1107,7 +1107,7 @@ void CCECClient::AddKey(const cec_keypress &key)
-+ 
-+   if (!isrepeat && (key.keycode != comboKey || key.duration > 0))
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x)", ToString(transmitKey.keycode), transmitKey.keycode);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key pressed: %s (%1x, %d)", ToString(transmitKey.keycode), transmitKey.keycode, transmitKey.duration);
-+     QueueAddKey(transmitKey);
-+   }
-+ }
-+@@ -1129,6 +1129,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   unsigned int timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+   cec_keypress key;
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-++  key.duration = 0;
-+ 
-+   {
-+     CLockObject lock(m_mutex);
-+@@ -1140,8 +1141,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+     if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          ((m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs) ||
-+-          (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)))
-++          m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1155,9 +1155,23 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_releasedButtoncount = 0;
-+     }
-+     else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-++          m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-++    {
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-++      key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-++
-++      m_iCurrentButton = CEC_USER_CONTROL_CODE_UNKNOWN;
-++      m_initialButtontime = 0;
-++      m_updateButtontime = 0;
-++      m_repeatButtontime = 0;
-++      m_releaseButtontime = 0;
-++      m_pressedButtoncount = 0;
-++      m_releasedButtoncount = 0;
-++    }
-++    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+           (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-+     {
-+-      key.duration = 0;
-++      key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+       m_repeatButtontime = iNow + m_configuration.iButtonRepeatRateMs;
-+       timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+@@ -1176,8 +1190,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+         timeout = CEC_PROCESSOR_SIGNAL_WAIT_TIME;
-+       }
-+     }
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "key %s: %s (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : key.duration ? "released" : "repeated",
-+-        ToString(m_iCurrentButton), m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "Key %s: %s (duration:%d) (%1x) timeout:%dms (rel:%d,rep:%d,prs:%d,rel:%d)", ToString(m_iCurrentButton), key.keycode == CEC_USER_CONTROL_CODE_UNKNOWN ? "idle" : m_repeatButtontime ? "repeated" : "released", key.duration,
-++        m_iCurrentButton, timeout, (int)(m_releaseButtontime ? m_releaseButtontime - iNow : 0), (int)(m_repeatButtontime ? m_repeatButtontime - iNow : 0), m_pressedButtoncount, m_releasedButtoncount);
-+   }
-+ 
-+   if (key.keycode != CEC_USER_CONTROL_CODE_UNKNOWN)
-+-- 
-+1.9.1
-+
-+
-+From 1ea01f59d8186d4d53af41961aaccbbc11651115 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Wed, 5 Nov 2014 21:04:25 +0000
-+Subject: [PATCH 6/6] squash: Fix for stop needing to be pressed twice
-+
-+---
-+ src/libcec/CECClient.cpp | 17 ++++++++---------
-+ 1 file changed, 8 insertions(+), 9 deletions(-)
-+
-+diff --git a/src/libcec/CECClient.cpp b/src/libcec/CECClient.cpp
-+index 1946148..f4f114b 100644
-+--- a/src/libcec/CECClient.cpp
-++++ b/src/libcec/CECClient.cpp
-+@@ -1131,6 +1131,8 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+   key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+   key.duration = 0;
-+ 
-++  if (m_iCurrentButton == CEC_USER_CONTROL_CODE_UNKNOWN)
-++    return timeout;
-+   {
-+     CLockObject lock(m_mutex);
-+     uint64_t iNow = GetTimeMs();
-+@@ -1140,8 +1142,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+     uint32_t iTimeoutMs(m_configuration.clientVersion >= LIBCEC_VERSION_TO_UINT(2, 0, 5) ?
-+         m_configuration.iComboKeyTimeoutMs : CEC_DEFAULT_COMBO_TIMEOUT_MS);
-+ 
-+-    if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-++    if (m_iCurrentButton == comboKey && iTimeoutMs > 0 && iNow - m_updateButtontime >= iTimeoutMs)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1154,8 +1155,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_pressedButtoncount = 0;
-+       m_releasedButtoncount = 0;
-+     }
-+-    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-++    else if (m_iCurrentButton != comboKey && m_releaseButtontime && iNow >= (uint64_t)m_releaseButtontime)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = CEC_USER_CONTROL_CODE_UNKNOWN;
-+@@ -1168,8 +1168,7 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+       m_pressedButtoncount = 0;
-+       m_releasedButtoncount = 0;
-+     }
-+-    else if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN &&
-+-          (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime))
-++    else if (m_iCurrentButton != comboKey && m_repeatButtontime && iNow >= (uint64_t)m_repeatButtontime)
-+     {
-+       key.duration = (unsigned int) (iNow - m_initialButtontime);
-+       key.keycode = m_iCurrentButton;
-+@@ -1178,11 +1177,11 @@ uint16_t CCECClient::CheckKeypressTimeout(void)
-+     }
-+     else
-+     {
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton == comboKey && iTimeoutMs > 0)
-++      if (m_iCurrentButton == comboKey && iTimeoutMs > 0)
-+         timeout = std::min((uint64_t)timeout, m_updateButtontime - iNow + iTimeoutMs);
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_releaseButtontime)
-++      if (m_iCurrentButton != comboKey && m_releaseButtontime)
-+         timeout = std::min((uint64_t)timeout, m_releaseButtontime - iNow);
-+-      if (m_iCurrentButton != CEC_USER_CONTROL_CODE_UNKNOWN && m_iCurrentButton != comboKey && m_repeatButtontime)
-++      if (m_iCurrentButton != comboKey && m_repeatButtontime)
-+         timeout = std::min((uint64_t)timeout, m_repeatButtontime - iNow);
-+       if (timeout > CEC_PROCESSOR_SIGNAL_WAIT_TIME)
-+       {
-+-- 
-+1.9.1
-+
-
-From bfc97f9146e8ac70fb03c439a4cf1a9a3135ea9b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Oct 2014 00:19:40 +0000
-Subject: [PATCH 31/93] [cec] Add settings for configuring button repeats
-
----
- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
- system/peripherals.xml                              |  4 +++-
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
- 3 files changed, 34 insertions(+), 1 deletion(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index cc486da..f9b8277 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -18177,3 +18177,18 @@ msgstr ""
- msgctxt "#38009"
- msgid "%i dB"
- msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38050"
-+msgid "Remote button press delay before repeating (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38051"
-+msgid "Remote button press repeat rate (ms)"
-+msgstr ""
-+
-+#: system/peripherals.xml
-+msgctxt "#38052"
-+msgid "Remote button press release time (ms)"
-+msgstr ""
-diff --git a/system/peripherals.xml b/system/peripherals.xml
-index ec3c3fe..c3dbae0 100644
---- a/system/peripherals.xml
-+++ b/system/peripherals.xml
-@@ -31,7 +31,9 @@
-     <setting key="device_type" type="int" value="1" configurable="0" />
-     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
-     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
--    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
-+    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
-+    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
-+    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
-   </peripheral>
- 
-   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index e0d8dae..f738c84 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -1278,6 +1278,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
-   m_configuration.bSendInactiveSource = config.bSendInactiveSource;
-   bChanged |= SetSetting("send_inactive_source", m_configuration.bSendInactiveSource == 1);
- 
-+#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
-+  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
-+#else
-+  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
-+  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs;
-+#endif
-+
-+  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
-+  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
-+
-+  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
-+  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
-+
-   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
-   m_configuration.bShutdownOnStandby = config.bShutdownOnStandby;
- 
-@@ -1382,6 +1396,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
-   // backwards compatibility. will be removed once the next major release of libCEC is out
-   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
- #endif
-+  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
-+  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
- 
-   if (GetSettingBool("pause_playback_on_deactivate"))
-   {
-
-From af63fad05fc2f6c24354c7acd08cd685ff376e28 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 3 Nov 2014 23:17:46 +0000
-Subject: [PATCH 32/93] [cec] Don't discard buttons when repeat mode is enabled
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index f738c84..58d7d0d 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -770,7 +770,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
- 
-   CSingleLock lock(m_critSection);
--  if (key.iDuration > 0)
-+  // avoid the queue getting too long
-+  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+    return;
-+  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-     {
-
-From 42155d82d06a1deea72d4c3092315ea1110c6cb7 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 33/93] [cec] Temp - more logging
-
----
- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-index 58d7d0d..dfba61a 100644
---- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-+++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
-@@ -767,12 +767,15 @@ void CPeripheralCecAdapter::GetNextKey(void)
- 
- void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
- {
--  CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
-+  CLog::Log(LOGDEBUG, "%s - received key %2x duration %d (rep:%d size:%d)", __FUNCTION__, key.iButton, key.iDuration, m_configuration.iButtonRepeatRateMs, m_buttonQueue.size());
- 
-   CSingleLock lock(m_critSection);
-   // avoid the queue getting too long
-   if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
-+  {
-+    CLog::Log(LOGDEBUG, "%s - discarded key %2x", __FUNCTION__, key.iButton);
-     return;
-+  }
-   if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
-   {
-     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
-@@ -781,6 +784,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-       if (m_bHasButton)
-         m_currentButton.iDuration = key.iDuration;
-       // ignore this one, since it's already been handled by xbmc
-+      CLog::Log(LOGDEBUG, "%s - ignored key %2x", __FUNCTION__, key.iButton);
-       return;
-     }
-     // if we received a keypress with a duration set, try to find the same one without a duration set, and replace it
-@@ -791,6 +795,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-         if ((*it).iDuration == 0)
-         {
-           // replace this entry
-+          CLog::Log(LOGDEBUG, "%s - replaced key %2x", __FUNCTION__, key.iButton);
-           (*it).iDuration = key.iDuration;
-           return;
-         }
-@@ -800,6 +805,7 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
-     }
-   }
- 
-+  CLog::Log(LOGDEBUG, "%s - added key %2x", __FUNCTION__, key.iButton);
-   m_buttonQueue.push_back(key);
- }
- 
-
-From f8d6e97fedcb9184af7dfc8a976815892faa7784 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 15 Nov 2014 12:03:34 +0000
-Subject: [PATCH 34/93] [dvdplayer] Add lock for player creation
-
----
- xbmc/cores/dvdplayer/DVDPlayer.cpp | 3 +++
- xbmc/cores/dvdplayer/DVDPlayer.h   | 1 +
- 2 files changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.cpp b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-index 4b09e8f..abcb8d2 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.cpp
-@@ -535,6 +535,7 @@ int CSelectionStreams::CountSource(StreamType type, StreamSource source) const
- 
- void CDVDPlayer::CreatePlayers()
- {
-+  CSingleLock lock(m_players_lock);
- #ifdef HAS_OMXPLAYER
-   bool omx_suitable = !OMXPlayerUnsuitable(m_HasVideo, m_HasAudio, m_pDemuxer, m_pInputStream, m_SelectionStreams);
-   if (m_omxplayer_mode != omx_suitable)
-@@ -566,6 +567,7 @@ void CDVDPlayer::CreatePlayers()
- 
- void CDVDPlayer::DestroyPlayers()
- {
-+  CSingleLock lock(m_players_lock);
-   if (!m_players_created)
-     return;
-   delete m_dvdPlayerVideo;
-@@ -4377,6 +4379,7 @@ double CDVDPlayer::GetQueueTime()
- 
- void CDVDPlayer::GetVideoStreamInfo(SPlayerVideoStreamInfo &info)
- {
-+  CSingleLock lock(m_players_lock);
-   info.bitrate = m_dvdPlayerVideo->GetVideoBitrate();
- 
-   std::string retVal;
-diff --git a/xbmc/cores/dvdplayer/DVDPlayer.h b/xbmc/cores/dvdplayer/DVDPlayer.h
-index 2f00647..b1418e3 100644
---- a/xbmc/cores/dvdplayer/DVDPlayer.h
-+++ b/xbmc/cores/dvdplayer/DVDPlayer.h
-@@ -567,4 +567,5 @@ protected:
-   // omxplayer variables
-   struct SOmxPlayerState m_OmxPlayerState;
-   bool m_omxplayer_mode;            // using omxplayer acceleration
-+  CCriticalSection m_players_lock;
- };
-
-From 2e80c975eb2d085f157ea328488aa7889c092f47 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 24 Nov 2014 22:07:25 +0000
-Subject: [PATCH 35/93] [dvdplayervideo] Prod decoder when in stills mode
-
-An asynchronous hardware decoder doesn't only produce output pictures when new packets arrive.
-In dvd stills mode give it a chance to return pictures that weren't ready when frame was decoded.
----
- xbmc/cores/dvdplayer/DVDPlayerVideo.cpp | 46 ++++++++++++++++++++-------------
- 1 file changed, 28 insertions(+), 18 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-index 379c541..b5777a1 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-@@ -313,7 +313,8 @@ void CDVDPlayerVideo::Process()
- 
-   while (!m_bStop)
-   {
--    int iQueueTimeOut = (int)(m_stalled ? frametime / 4 : frametime * 10) / 1000;
-+    bool bPictureWaiting = m_hints.stills && (m_pVideoCodec->Decode(NULL, 0, DVD_NOPTS_VALUE, DVD_NOPTS_VALUE) & VC_PICTURE);
-+    int iQueueTimeOut = (int)(bPictureWaiting ? 0 : (m_hints.stills || m_stalled) ? frametime / 4 : frametime * 10) / 1000;
-     int iPriority = (m_speed == DVD_PLAYSPEED_PAUSE && m_started) ? 1 : 0;
- 
-     CDVDMsg* pMsg;
-@@ -330,27 +331,36 @@ void CDVDPlayerVideo::Process()
-       if( iPriority )
-         continue;
- 
--      //Okey, start rendering at stream fps now instead, we are likely in a stillframe
--      if( !m_stalled )
-+      // check for picture waiting
-+      if (bPictureWaiting)
-       {
--        if(m_started)
--          CLog::Log(LOGINFO, "CDVDPlayerVideo - Stillframe detected, switching to forced %f fps", m_fFrameRate);
--        m_stalled = true;
--        pts+= frametime*4;
-+        // create a dummy demuxer packet to prod the decode logic
-+        pMsg = new CDVDMsgDemuxerPacket(CDVDDemuxUtils::AllocateDemuxPacket(0), false);
-       }
--
--      //Waiting timed out, output last picture
--      if( picture.iFlags & DVP_FLAG_ALLOCATED )
-+      else
-       {
--        //Remove interlaced flag before outputting
--        //no need to output this as if it was interlaced
--        picture.iFlags &= ~DVP_FLAG_INTERLACED;
--        picture.iFlags |= DVP_FLAG_NOSKIP;
--        OutputPicture(&picture, pts);
--        pts+= frametime;
--      }
-+        //Okey, start rendering at stream fps now instead, we are likely in a stillframe
-+        if( !m_stalled )
-+        {
-+          if(m_started)
-+            CLog::Log(LOGINFO, "CDVDPlayerVideo - Stillframe detected, switching to forced %f fps", m_fFrameRate);
-+          m_stalled = true;
-+          pts+= frametime*4;
-+        }
- 
--      continue;
-+        //Waiting timed out, output last picture
-+        if( picture.iFlags & DVP_FLAG_ALLOCATED )
-+        {
-+          //Remove interlaced flag before outputting
-+          //no need to output this as if it was interlaced
-+          picture.iFlags &= ~DVP_FLAG_INTERLACED;
-+          picture.iFlags |= DVP_FLAG_NOSKIP;
-+          OutputPicture(&picture, pts);
-+          pts+= frametime;
-+        }
-+
-+        continue;
-+      }
-     }
- 
-     if (pMsg->IsType(CDVDMsg::GENERAL_SYNCHRONIZE))
-
-From 1a4b613e9981829137c817baad127fda8e1e2823 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 36/93] [languageinvoker] Reduce priority of python threads
-
----
- xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/xbmc/interfaces/generic/LanguageInvokerThread.cpp b/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-index fcdd063..16f0c89 100644
---- a/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-+++ b/xbmc/interfaces/generic/LanguageInvokerThread.cpp
-@@ -50,6 +50,11 @@ bool CLanguageInvokerThread::execute(const std::string &script, const std::vecto
-   m_args = arguments;
- 
-   Create();
-+  #ifdef TARGET_RASPBERRY_PI
-+  /* low prio */
-+  SetPriority(GetPriority()-1);
-+  #endif
-+
-   return true;
- }
- 
-
-From 73c6f413799cbb821f597253eb80457ee29a45f8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 14 Dec 2013 16:55:05 +0000
-Subject: [PATCH 37/93] logging: Add microsecond timer to log messages
-
----
- xbmc/utils/log.cpp | 17 +++++++++++++++--
- 1 file changed, 15 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/utils/log.cpp b/xbmc/utils/log.cpp
-index 3443f12..31c4a99 100644
---- a/xbmc/utils/log.cpp
-+++ b/xbmc/utils/log.cpp
-@@ -24,6 +24,7 @@
- #include "threads/Thread.h"
- #include "utils/StringUtils.h"
- #include "CompileInfo.h"
-+#include "utils/TimeUtils.cpp"
- 
- static const char* const levelNames[] =
- {"DEBUG", "INFO", "NOTICE", "WARNING", "ERROR", "SEVERE", "FATAL", "NONE"};
-@@ -198,19 +199,31 @@ void CLog::PrintDebugString(const std::string& line)
- 
- bool CLog::WriteLogString(int logLevel, const std::string& logString)
- {
-+#if defined(TARGET_LINUX)
-+  static const char* prefixFormat = "%02.2d:%02.2d:%02.2d %10.6f T:%" PRIu64" %7s: ";
-+#else
-   static const char* prefixFormat = "%02.2d:%02.2d:%02.2d T:%" PRIu64" %7s: ";
--
-+#endif
-   std::string strData(logString);
-   /* fixup newline alignment, number of spaces should equal prefix length */
-   StringUtils::Replace(strData, "\n", "\n                                            ");
- 
-   int hour, minute, second;
-   s_globals.m_platform.GetCurrentLocalTime(hour, minute, second);
--  
-+
-+#if defined(TARGET_LINUX)
-+  struct timespec now;
-+  clock_gettime(CLOCK_MONOTONIC, &now);
-+  float Now = now.tv_sec + now.tv_nsec * 1e-9;
-+#endif
-+
-   strData = StringUtils::Format(prefixFormat,
-                                   hour,
-                                   minute,
-                                   second,
-+#if defined(TARGET_LINUX)
-+                                  Now,
-+#endif
-                                   (uint64_t)CThread::GetCurrentThreadId(),
-                                   levelNames[logLevel]) + strData;
- 
-
-From dd959edaaae1f167e0979ac55d64e5d769127687 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 38/93] [rbp] hack: wait for splash to complete before changing
- hdmi mode
-
----
- xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp | 49 +++++++++++++++++++++++++
- 1 file changed, 49 insertions(+)
-
-diff --git a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-index ee29770..ff0d3e3 100644
---- a/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-+++ b/xbmc/windowing/egl/EGLNativeTypeRaspberryPI.cpp
-@@ -221,12 +221,61 @@ int CEGLNativeTypeRaspberryPI::AddUniqueResolution(RESOLUTION_INFO &res, std::ve
- }
- #endif
- 
-+#include <dirent.h>
-+
-+pid_t proc_find(const char* name)
-+{
-+    DIR* dir;
-+    struct dirent* ent;
-+    char buf[512];
-+
-+    long  pid;
-+    char pname[100] = {0,};
-+    char state;
-+    FILE *fp=NULL;
-+
-+    if (!(dir = opendir("/proc"))) {
-+        perror("can't open /proc");
-+        return -1;
-+    }
-+
-+    while((ent = readdir(dir)) != NULL) {
-+        long lpid = atol(ent->d_name);
-+        if(lpid < 0)
-+            continue;
-+        snprintf(buf, sizeof(buf), "/proc/%ld/stat", lpid);
-+        fp = fopen(buf, "r");
-+
-+        if (fp) {
-+            if ( (fscanf(fp, "%ld (%[^)]) %c", &pid, pname, &state)) != 3 ){
-+                printf("fscanf failed \n");
-+                fclose(fp);
-+                closedir(dir);
-+                return -1;
-+            }
-+            if (!strcmp(pname, name)) {
-+                fclose(fp);
-+                closedir(dir);
-+                return (pid_t)lpid;
-+            }
-+            fclose(fp);
-+        }
-+    }
-+
-+    closedir(dir);
-+    return -1;
-+}
-+
-+
- bool CEGLNativeTypeRaspberryPI::SetNativeResolution(const RESOLUTION_INFO &res)
- {
- #if defined(TARGET_RASPBERRY_PI)
-   if(!m_DllBcmHost || !m_nativeWindow)
-     return false;
- 
-+  while (proc_find("hello_video.bin") >= 0)
-+    Sleep(100);
-+
-   DestroyDispmaxWindow();
- 
-   RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
-
-From 3afc5e302cd1fdad4afa43ec705707f36de2ddaf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 39/93] Fix for UI not showing both extractflags and
- extractthumb
-
----
- addons/resource.language.en_gb/resources/strings.po | 11 ++++++++---
- system/settings/settings.xml                        |  4 ++--
- 2 files changed, 10 insertions(+), 5 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index f9b8277..32314b2 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -11815,7 +11815,7 @@ msgstr ""
- 
- #: system/settings/settings.xml
- msgctxt "#20433"
--msgid "Extract thumbnails and video information"
-+msgid "Extract video information from files"
- msgstr ""
- 
- #: xbmc/dialogs/GUIDialogSmartPlaylistRule.cpp
-@@ -15987,7 +15987,7 @@ msgstr ""
- #. Description of setting "Videos -> File lists -> Extract thumbnails and video information" with label #20433
- #: system/settings/settings.xml
- msgctxt "#36178"
--msgid "Extract thumbnails and metadata information such as codec and aspect ratio from videos."
-+msgid "Extract metadata information such as codec and aspect ratio from videos."
- msgstr ""
- 
- #. Description of setting "Videos -> File lists -> Replace file names with library titles" with label #20419
-@@ -15999,7 +15999,7 @@ msgstr ""
- #. Description of setting "Videos -> File lists -> Extract thumbnails and video information" with label #20433
- #: system/settings/settings.xml
- msgctxt "#36180"
--msgid "Extract thumbnails and information, such as codecs and aspect ratio, to display in library mode."
-+msgid "Extract thumbnails, to display in library Mode."
- msgstr ""
- 
- #: system/settings/settings.xml
-@@ -18192,3 +18192,8 @@ msgstr ""
- msgctxt "#38052"
- msgid "Remote button press release time (ms)"
- msgstr ""
-+
-+#: system/settings/settings.xml
-+msgctxt "#38103"
-+msgid "Extract thumbnails from video files"
-+msgstr ""
-diff --git a/system/settings/settings.xml b/system/settings/settings.xml
-index 918e8bf..61e1a22 100644
---- a/system/settings/settings.xml
-+++ b/system/settings/settings.xml
-@@ -885,8 +885,8 @@
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
--        <setting id="myvideos.extractthumb" type="boolean" label="20433" help="36180">
--          <level>4</level>
-+        <setting id="myvideos.extractthumb" type="boolean" label="38103" help="36180">
-+          <level>1</level>
-           <default>true</default>
-           <control type="toggle" />
-         </setting>
-
-From c423d114818b5cd611bd83c31cda74139b5dfd91 Mon Sep 17 00:00:00 2001
-From: anaconda <anaconda@menakite.eu>
-Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 40/93] Disable autoscrolling while on screensaver and while
- opening streams.
-
----
- xbmc/Application.cpp                | 10 ++++++++++
- xbmc/Application.h                  |  2 ++
- xbmc/guilib/GUIFadeLabelControl.cpp |  4 +++-
- xbmc/guilib/GUIFont.cpp             |  4 ++++
- xbmc/guilib/GUILabel.cpp            |  4 +++-
- xbmc/guilib/GUITextBox.cpp          |  3 ++-
- 6 files changed, 24 insertions(+), 3 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index cb10ffa..c274e2f 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -5296,3 +5296,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
-   
-   return false;
- }
-+
-+bool CApplication::ScreenSaverDisablesAutoScrolling()
-+{
-+  bool onBlackDimScreenSaver = IsInScreenSaver() && m_screenSaver &&
-+    (m_screenSaver->ID() == "screensaver.xbmc.builtin.black" ||
-+     m_screenSaver->ID() == "screensaver.xbmc.builtin.dim");
-+  bool openingStreams = m_pPlayer->IsPlaying() && g_windowManager.IsWindowActive(WINDOW_DIALOG_BUSY);
-+
-+  return onBlackDimScreenSaver || openingStreams;
-+}
-diff --git a/xbmc/Application.h b/xbmc/Application.h
-index d7e5eee..a34ed98 100644
---- a/xbmc/Application.h
-+++ b/xbmc/Application.h
-@@ -390,6 +390,8 @@ public:
-    */
-   void UnregisterActionListener(IActionListener *listener);
- 
-+  bool ScreenSaverDisablesAutoScrolling();
-+
- protected:
-   virtual bool OnSettingsSaving() const override;
- 
-diff --git a/xbmc/guilib/GUIFadeLabelControl.cpp b/xbmc/guilib/GUIFadeLabelControl.cpp
-index ebd435e..97efc8a 100644
---- a/xbmc/guilib/GUIFadeLabelControl.cpp
-+++ b/xbmc/guilib/GUIFadeLabelControl.cpp
-@@ -20,6 +20,8 @@
- 
- #include "GUIFadeLabelControl.h"
- 
-+#include "Application.h"
-+
- CGUIFadeLabelControl::CGUIFadeLabelControl(int parentID, int controlID, float posX, float posY, float width, float height, const CLabelInfo& labelInfo, bool scrollOut, unsigned int timeToDelayAtEnd, bool resetOnLabelChange, bool randomized)
-     : CGUIControl(parentID, controlID, posX, posY, width, height), m_label(labelInfo), m_scrollInfo(50, labelInfo.offsetX, labelInfo.scrollSpeed)
-     , m_textLayout(labelInfo.font, false)
-@@ -105,7 +107,7 @@ void CGUIFadeLabelControl::Process(unsigned int currentTime, CDirtyRegionList &d
-     m_lastLabel = m_currentLabel;
-   }
- 
--  if (m_infoLabels.size() > 1 || !m_shortText)
-+  if ((m_infoLabels.size() > 1 || !m_shortText) && !g_application.ScreenSaverDisablesAutoScrolling())
-   { // have scrolling text
-     bool moveToNextLabel = false;
-     if (!m_scrollOut)
-diff --git a/xbmc/guilib/GUIFont.cpp b/xbmc/guilib/GUIFont.cpp
-index 7f11089..1192b74 100644
---- a/xbmc/guilib/GUIFont.cpp
-+++ b/xbmc/guilib/GUIFont.cpp
-@@ -22,6 +22,7 @@
- #include "GUIFontTTF.h"
- #include "GraphicContext.h"
- 
-+#include "Application.h"
- #include "threads/SingleLock.h"
- #include "utils/TimeUtils.h"
- #include "utils/MathUtils.h"
-@@ -128,6 +129,9 @@ bool CGUIFont::UpdateScrollInfo(const vecText &text, CScrollInfo &scrollInfo)
-   //   If the string is smaller than the viewport, then it may be plotted even
-   //   more times than that.
-   //
-+  if (g_application.ScreenSaverDisablesAutoScrolling())
-+    return false;
-+
-   if (scrollInfo.waitTime)
-   {
-     scrollInfo.waitTime--;
-diff --git a/xbmc/guilib/GUILabel.cpp b/xbmc/guilib/GUILabel.cpp
-index 759ac09..bed6ad2 100644
---- a/xbmc/guilib/GUILabel.cpp
-+++ b/xbmc/guilib/GUILabel.cpp
-@@ -21,6 +21,8 @@
- #include "GUILabel.h"
- #include <limits>
- 
-+#include "Application.h"
-+
- CGUILabel::CGUILabel(float posX, float posY, float width, float height, const CLabelInfo& labelInfo, CGUILabel::OVER_FLOW overflow)
-     : m_label(labelInfo)
-     , m_textLayout(labelInfo.font, overflow == OVER_FLOW_WRAP, height)
-@@ -104,7 +106,7 @@ void CGUILabel::Render()
-   color_t color = GetColor();
-   bool renderSolid = (m_color == COLOR_DISABLED);
-   bool overFlows = (m_renderRect.Width() + 0.5f < m_textLayout.GetTextWidth()); // 0.5f to deal with floating point rounding issues
--  if (overFlows && m_scrolling && !renderSolid)
-+  if (overFlows && m_scrolling && !renderSolid && !g_application.ScreenSaverDisablesAutoScrolling())
-     m_textLayout.RenderScrolling(m_renderRect.x1, m_renderRect.y1, m_label.angle, color, m_label.shadowColor, 0, m_renderRect.Width(), m_scrollInfo);
-   else
-   {
-diff --git a/xbmc/guilib/GUITextBox.cpp b/xbmc/guilib/GUITextBox.cpp
-index d7bc1c5..ac76629 100644
---- a/xbmc/guilib/GUITextBox.cpp
-+++ b/xbmc/guilib/GUITextBox.cpp
-@@ -24,6 +24,7 @@
- #include "utils/MathUtils.h"
- #include "utils/StringUtils.h"
- #include "guiinfo/GUIInfoLabels.h"
-+#include "Application.h"
- 
- #include <algorithm>
- 
-@@ -133,7 +134,7 @@ void CGUITextBox::Process(unsigned int currentTime, CDirtyRegionList &dirtyregio
-   // update our auto-scrolling as necessary
-   if (m_autoScrollTime && m_lines.size() > m_itemsPerPage)
-   {
--    if (!m_autoScrollCondition || m_autoScrollCondition->Get())
-+    if ((!m_autoScrollCondition || m_autoScrollCondition->Get()) && !g_application.ScreenSaverDisablesAutoScrolling())
-     {
-       if (m_lastRenderTime)
-         m_autoScrollDelayTime += currentTime - m_lastRenderTime;
-
-From 6b4fbcdd92b654b53fe8aeb5f00a5037117a505f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 41/93] [demuxer] Avoid memcpy on every demuxer packet
-
-Avoids an unnecessary memcpy on every demuxer packet which for
-high bitrate videos can be significant.
----
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 17 ++++++++++++-----
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h   |  3 +++
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp  |  7 ++++++-
- 3 files changed, 21 insertions(+), 6 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-index 88d486b..47c15b9 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-@@ -753,7 +753,7 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-           {
-             if(m_pkt.pkt.stream_index == (int)m_pFormatContext->programs[m_program]->stream_index[i])
-             {
--              pPacket = CDVDDemuxUtils::AllocateDemuxPacket(m_pkt.pkt.size);
-+              pPacket = CDVDDemuxUtils::AllocateDemuxPacket(0);
-               break;
-             }
-           }
-@@ -762,7 +762,7 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-             bReturnEmpty = true;
-         }
-         else
--          pPacket = CDVDDemuxUtils::AllocateDemuxPacket(m_pkt.pkt.size);
-+          pPacket = CDVDDemuxUtils::AllocateDemuxPacket(0);
-       }
-       else
-         bReturnEmpty = true;
-@@ -804,9 +804,13 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-         // copy contents into our own packet
-         pPacket->iSize = m_pkt.pkt.size;
- 
--        // maybe we can avoid a memcpy here by detecting where pkt.destruct is pointing too?
-         if (m_pkt.pkt.data)
--          memcpy(pPacket->pData, m_pkt.pkt.data, pPacket->iSize);
-+        {
-+          pPacket->pData = m_pkt.pkt.data;
-+          // so we can free AVPacket when DemuxPacket is freed
-+          pPacket->pkt = new AVPacket(m_pkt.pkt);
-+        }
-+
- 
-         pPacket->pts = ConvertTimestamp(m_pkt.pkt.pts, stream->time_base.den, stream->time_base.num);
-         pPacket->dts = ConvertTimestamp(m_pkt.pkt.dts, stream->time_base.den, stream->time_base.num);
-@@ -841,7 +845,10 @@ DemuxPacket* CDVDDemuxFFmpeg::Read()
-         pPacket->iStreamId = m_pkt.pkt.stream_index;
-       }
-       m_pkt.result = -1;
--      av_free_packet(&m_pkt.pkt);
-+      if (pPacket && pPacket->pkt)
-+        memset(&m_pkt.pkt, 0, sizeof(AVPacket));
-+      else
-+        av_free_packet(&m_pkt.pkt);
-     }
-   }
-   } // end of lock scope
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-index d64fbb3..012a7d1 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxPacket.h
-@@ -23,6 +23,8 @@
- #define DMX_SPECIALID_STREAMINFO    -10
- #define DMX_SPECIALID_STREAMCHANGE  -11
- 
-+struct AVPacket;
-+
-  typedef struct DemuxPacket
- {
-   unsigned char* pData;   // data
-@@ -33,4 +35,5 @@
-   double pts; // pts in DVD_TIME_BASE
-   double dts; // dts in DVD_TIME_BASE
-   double duration; // duration in DVD_TIME_BASE if available
-+  AVPacket *pkt; // to allow packet to be freed
- } DemuxPacket;
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-index ab298b2..10c5ee0 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxUtils.cpp
-@@ -34,7 +34,12 @@ void CDVDDemuxUtils::FreeDemuxPacket(DemuxPacket* pPacket)
-   if (pPacket)
-   {
-     try {
--      if (pPacket->pData) _aligned_free(pPacket->pData);
-+      if (pPacket->pkt)
-+      {
-+        av_free_packet(pPacket->pkt);
-+        delete pPacket->pkt;
-+      }
-+      else if (pPacket->pData) _aligned_free(pPacket->pData);
-       delete pPacket;
-     }
-     catch(...) {
-
-From 4e92f88d301118106a6aa08375bdd524fbbb0da8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 15 Feb 2015 14:06:12 +0000
-Subject: [PATCH 42/93] [mmal] Allow mmal codec for dvd stills
-
----
- xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-index 84e9ef1..f920f49 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDFactoryCodec.cpp
-@@ -50,6 +50,9 @@
- #include "Video/DVDVideoCodecAndroidMediaCodec.h"
- #include "android/activity/AndroidFeatures.h"
- #endif
-+#if defined(HAS_MMAL)
-+#include "linux/RBP.h"
-+#endif
- #include "Audio/DVDAudioCodecFFmpeg.h"
- #include "Audio/DVDAudioCodecPassthrough.h"
- #include "Overlay/DVDOverlayCodecSSA.h"
-@@ -201,6 +204,10 @@ CDVDVideoCodec* CDVDFactoryCodec::CreateVideoCodec(CDVDStreamInfo &hint, const C
- #endif
-   CLog::Log(LOGDEBUG, "CDVDFactoryCodec: compiled in hardware support: %s", hwSupport.c_str());
- 
-+#if defined(HAS_MMAL)
-+  // mmal can handle dvd playback including stills
-+  if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || !g_RBP.GetCodecMpg2())
-+#endif
-   if (hint.stills && (hint.codec == AV_CODEC_ID_MPEG2VIDEO || hint.codec == AV_CODEC_ID_MPEG1VIDEO))
-   {
-      // If dvd is an mpeg2 and hint.stills
-
-From 6f7b1c2fa7e8b46895b2287b3a9361b85af7b210 Mon Sep 17 00:00:00 2001
-From: anaconda <anaconda@menakite.eu>
-Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 43/93] Load OSD dialogs on startup.
-
-Fixes skipped frames the first time they're loaded in memory on less powered
-devices, like a Raspberry Pi, when using DVDPlayer.
-See http://forum.kodi.tv/showthread.php?tid=211501&pid=1938811#pid1938811
----
- xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp          | 1 +
- xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp             | 1 +
- xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp | 4 +++-
- xbmc/video/dialogs/GUIDialogSubtitles.cpp             | 2 +-
- xbmc/video/dialogs/GUIDialogVideoOSD.cpp              | 2 +-
- xbmc/video/dialogs/GUIDialogVideoSettings.cpp         | 4 +++-
- 6 files changed, 10 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp b/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-index d7e8ac4..76b8c5a 100644
---- a/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-+++ b/xbmc/pvr/dialogs/GUIDialogPVRChannelsOSD.cpp
-@@ -50,6 +50,7 @@ CGUIDialogPVRChannelsOSD::CGUIDialogPVRChannelsOSD() :
-     CGUIDialog(WINDOW_DIALOG_PVR_OSD_CHANNELS, "DialogPVRChannelsOSD.xml"),
-     Observer()
- {
-+  m_loadType = LOAD_ON_GUI_INIT;
-   m_vecItems = new CFileItemList;
- }
- 
-diff --git a/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp b/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-index 81dbc27..0462310 100644
---- a/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-+++ b/xbmc/pvr/dialogs/GUIDialogPVRGuideOSD.cpp
-@@ -36,6 +36,7 @@ using namespace PVR;
- CGUIDialogPVRGuideOSD::CGUIDialogPVRGuideOSD()
-     : CGUIDialog(WINDOW_DIALOG_PVR_OSD_GUIDE, "DialogPVRGuideOSD.xml")
- {
-+  m_loadType = LOAD_ON_GUI_INIT;
-   m_vecItems = new CFileItemList;
- }
- 
-diff --git a/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp b/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-index 8d57767..60b6461 100644
---- a/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-+++ b/xbmc/video/dialogs/GUIDialogAudioSubtitleSettings.cpp
-@@ -68,7 +68,9 @@ CGUIDialogAudioSubtitleSettings::CGUIDialogAudioSubtitleSettings()
-   : CGUIDialogSettingsManualBase(WINDOW_DIALOG_AUDIO_OSD_SETTINGS, "VideoOSDSettings.xml"),
-     m_passthrough(false),
-     m_dspEnabled(false)
--{ }
-+{
-+  m_loadType = LOAD_ON_GUI_INIT;
-+}
- 
- CGUIDialogAudioSubtitleSettings::~CGUIDialogAudioSubtitleSettings()
- { }
-diff --git a/xbmc/video/dialogs/GUIDialogSubtitles.cpp b/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-index 3db982a..dd35664 100644
---- a/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-+++ b/xbmc/video/dialogs/GUIDialogSubtitles.cpp
-@@ -103,7 +103,7 @@ CGUIDialogSubtitles::CGUIDialogSubtitles(void)
-     , m_pausedOnRun(false)
-     , m_updateSubsList(false)
- {
--  m_loadType = KEEP_IN_MEMORY;
-+  m_loadType  = LOAD_ON_GUI_INIT;
- }
- 
- CGUIDialogSubtitles::~CGUIDialogSubtitles(void)
-diff --git a/xbmc/video/dialogs/GUIDialogVideoOSD.cpp b/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-index c1e99cf..5e3a31b 100644
---- a/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-+++ b/xbmc/video/dialogs/GUIDialogVideoOSD.cpp
-@@ -30,7 +30,7 @@ using namespace PVR;
- CGUIDialogVideoOSD::CGUIDialogVideoOSD(void)
-     : CGUIDialog(WINDOW_DIALOG_VIDEO_OSD, "VideoOSD.xml")
- {
--  m_loadType = KEEP_IN_MEMORY;
-+  m_loadType = LOAD_ON_GUI_INIT;
- }
- 
- CGUIDialogVideoOSD::~CGUIDialogVideoOSD(void)
-diff --git a/xbmc/video/dialogs/GUIDialogVideoSettings.cpp b/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-index b248566..96c63cd 100644
---- a/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-+++ b/xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-@@ -66,7 +66,9 @@
- CGUIDialogVideoSettings::CGUIDialogVideoSettings()
-     : CGUIDialogSettingsManualBase(WINDOW_DIALOG_VIDEO_OSD_SETTINGS, "VideoOSDSettings.xml"),
-       m_viewModeChanged(false)
--{ }
-+{
-+  m_loadType = LOAD_ON_GUI_INIT;
-+}
- 
- CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
- { }
-
-From 881432f2448626f24ea06cf02a29c811b075cdc8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 7 Mar 2015 22:46:21 +0000
-Subject: [PATCH 44/93] configure: Add raspberry-pi2 platform
-
----
- configure.ac                                      | 14 +++++++--
- m4/xbmc_arch.m4                                   |  8 ++---
- tools/depends/Makefile.include.in                 |  2 +-
- tools/depends/configure.ac                        | 38 ++++++++++++++++-------
- tools/depends/target/Toolchain.cmake.in           |  2 +-
- tools/depends/target/Toolchain_binaddons.cmake.in |  2 +-
- 6 files changed, 44 insertions(+), 22 deletions(-)
-
-diff --git a/configure.ac b/configure.ac
-index 55e73b9..7a06a31 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -698,8 +698,17 @@ case $use_platform in
-   raspberry-pi)
-      target_platform=target_raspberry_pi
-      use_neon=no
--     use_arch="arm"
-      use_cpu=arm1176jzf-s
-+     ;;
-+  raspberry-pi2)
-+     target_platform=target_raspberry_pi
-+     use_neon=yes
-+     use_cpu=cortex-a7
-+     ;;
-+esac
-+
-+if test "$target_platform" = "target_raspberry_pi" ; then
-+     use_arch="arm"
-      use_hardcoded_tables="yes"
-      use_openmax=no
-      ARCH="arm"
-@@ -708,8 +717,7 @@ case $use_platform in
-      USE_MMAL=1; AC_DEFINE([HAS_MMAL],[1],["Define to 1 if MMAL libs is enabled"])
-      CFLAGS="$CFLAGS"
-      CXXFLAGS="$CXXFLAGS"
--     ;;
--esac
-+fi
- 
- if test "$host_vendor" = "apple"; then
-   use_avahi=no
-diff --git a/m4/xbmc_arch.m4 b/m4/xbmc_arch.m4
-index 0b66a82..adb8e97 100644
---- a/m4/xbmc_arch.m4
-+++ b/m4/xbmc_arch.m4
-@@ -77,9 +77,7 @@ if test "$target_platform" = "target_android" ; then
-   AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -DTARGET_ANDROID")
- fi
- 
--case $use_platform in
--  raspberry-pi)
--     AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -D_ARMEL -DTARGET_RASPBERRY_PI")
--     ;;
--esac
-+if test "$target_platform" = "target_raspberry_pi" ; then
-+  AC_SUBST(ARCH_DEFINES, "-DTARGET_POSIX -DTARGET_LINUX -D_LINUX -D_ARMEL -DTARGET_RASPBERRY_PI")
-+fi
- ])
-diff --git a/tools/depends/Makefile.include.in b/tools/depends/Makefile.include.in
-index 6e37022..326e7b8 100644
---- a/tools/depends/Makefile.include.in
-+++ b/tools/depends/Makefile.include.in
-@@ -20,7 +20,7 @@ NATIVE_OS=@build_os@
- CROSS_COMPILING=@cross_compiling@
- ARCH_DEFINES=@ARCH_DEFINES@
- NATIVE_ARCH_DEFINES=@NATIVE_ARCH_DEFINES@
--TARGET_PLATFORM=@use_platform@
-+TARGET_PLATFORM=@target_platform@
- XCODE_VERSION=@use_xcode@
- AAPT=@AAPT@
- DX=@DX@
-diff --git a/tools/depends/configure.ac b/tools/depends/configure.ac
-index 12935e3..478f5f0 100644
---- a/tools/depends/configure.ac
-+++ b/tools/depends/configure.ac
-@@ -17,7 +17,8 @@ AC_ARG_WITH([toolchain],
- AC_ARG_WITH([platform],
-   [AS_HELP_STRING([--with-platform],
-   [target platform [auto]])],
--  [use_platform=$withval])
-+  [use_platform=$withval],
-+  [target_platform=$withval])
- 
- AC_ARG_WITH([firmware],
-   [AS_HELP_STRING([--with-firmware],
-@@ -302,34 +303,49 @@ case $host in
-     AC_MSG_ERROR(unsupported host ($use_host))
- esac
- 
--if test "$use_platform" = "raspberry-pi"; then
-+case $use_platform in
-+  raspberry-pi)
-+     target_platform=raspberry_pi
-+     use_neon=no
-+     use_cpu=arm1176jzf-s
-+     platform_cflags="-mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp"
-+     platform_cxxflags="-mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard -mfpu=vfp"
-+     platform_ldflags=""
-+     ;;
-+  raspberry-pi2)
-+     target_platform=raspberry_pi
-+     use_neon=yes
-+     use_cpu=cortex-a7
-+     platform_cflags="-fPIC -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon-vfpv4"
-+     platform_cxxflags="-fPIC -mcpu=cortex-a7 -mfloat-abi=hard -mfpu=neon-vfpv4"
-+     platform_ldflags="-lpthread"
-+     ;;
-+esac
-+
-+if test "$target_platform" = "raspberry_pi" ; then
-   if test -d "${use_firmware}/opt/vc/include"; then
-     :
-   else
-     AC_MSG_ERROR([Raspberry Pi firmware not found])
-   fi
--  use_neon=no
-   use_arch="arm"
--  use_cpu="arm1176jzf-s"
-   use_hardcoded_tables="yes"
--  use_alsa="no"
-   ARCH="arm"
-   platform_os="linux"
-   cross_compiling="yes"
-   use_host="arm-linux-gnueabihf"
-   deps_dir="$use_host"
--  platform_cflags="-pipe -mcpu=arm1176jzf-s -mtune=arm1176jzf-s -mfloat-abi=hard \
--   -mfpu=vfp -mabi=aapcs-linux -Wno-psabi -Wa,-mno-warn-deprecated \
--   -Wno-deprecated-declarations -isystem${use_firmware}/opt/vc/include \
-+  platform_cflags+=" -pipe -mabi=aapcs-linux -Wno-psabi \
-+   -Wa,-mno-warn-deprecated -Wno-deprecated-declarations \
-+   -isystem${use_firmware}/opt/vc/include \
-    -isystem${use_firmware}/opt/vc/include/interface/vcos/pthreads \
-    -isystem${use_firmware}/opt/vc/include/interface/vmcs_host/linux"
--  platform_cxxflags="-pipe -mcpu=arm1176jzf-s -mtune=arm1176jzf-s \
--   -mfloat-abi=hard -mfpu=vfp -mabi=aapcs-linux -Wno-psabi \
-+  platform_cxxflags+=" -pipe -mabi=aapcs-linux -Wno-psabi \
-    -Wa,-mno-warn-deprecated -Wno-deprecated-declarations \
-    -isystem${use_firmware}/opt/vc/include \
-    -isystem${use_firmware}/opt/vc/include/interface/vcos/pthreads \
-    -isystem${use_firmware}/opt/vc/include/interface/vmcs_host/linux"
--  platform_ldflags="-L${use_firmware}/opt/vc/lib -lEGL -lGLESv2 -lbcm_host -lvcos \
-+  platform_ldflags+=" -L${use_firmware}/opt/vc/lib -lEGL -lGLESv2 -lbcm_host -lvcos \
-    -lvchiq_arm"
- fi
- 
-diff --git a/tools/depends/target/Toolchain.cmake.in b/tools/depends/target/Toolchain.cmake.in
-index 943be73..59385e8 100644
---- a/tools/depends/target/Toolchain.cmake.in
-+++ b/tools/depends/target/Toolchain.cmake.in
-@@ -1,6 +1,6 @@
- SET(OS "@platform_os@")
- SET(CPU "@use_cpu@")
--SET(PLATFORM "@use_platform@")
-+SET(PLATFORM "@target_platform@")
- IF("${OS}" STREQUAL "linux" OR "${OS}" STREQUAL "android")
- SET(CMAKE_SYSTEM_NAME Linux)
- ENDIF()
-diff --git a/tools/depends/target/Toolchain_binaddons.cmake.in b/tools/depends/target/Toolchain_binaddons.cmake.in
-index dc6d565..98494b4 100644
---- a/tools/depends/target/Toolchain_binaddons.cmake.in
-+++ b/tools/depends/target/Toolchain_binaddons.cmake.in
-@@ -1,7 +1,7 @@
- set(CMAKE_SYSTEM_VERSION 1)
- set(OS "@platform_os@")
- set(CPU "@use_cpu@")
--set(PLATFORM "@use_platform@")
-+set(PLATFORM "@target_platform@")
- if("${OS}" STREQUAL "linux" OR "${OS}" STREQUAL "android")
-   set(CMAKE_SYSTEM_NAME Linux)
- endif()
-
-From 555c3d2ed48c00e6ef8632d47db58cab4d53b78b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 45/93] [gui] Also limit GUI updates when in non full-screen
- video mode
-
----
- xbmc/Application.cpp | 4 +++-
- 1 file changed, 3 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index c274e2f..212a5c7 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -2834,7 +2834,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
- #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU)
-     // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode
-     // it makes only sense on architectures with multiple layers
--    if (g_graphicsContext.IsFullScreenVideo() && !m_pPlayer->IsPausedPlayback() && g_renderManager.IsVideoLayer())
-+    if (m_pPlayer->IsPlayingVideo() && !m_pPlayer->IsPausedPlayback() && g_renderManager.IsVideoLayer())
-       fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE);
- #endif
- 
-@@ -2847,6 +2847,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
-     {
-       if (!m_skipGuiRender)
-         g_windowManager.Process(CTimeUtils::GetFrameTime());
-+      else if (!g_graphicsContext.IsFullScreenVideo())
-+        g_renderManager.FrameMove();
-     }
-     g_windowManager.FrameMove();
-   }
-
-From 67b90947ab8fb7fe16d39597f285a7e08fabc5b8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 21 Apr 2015 14:32:07 +0100
-Subject: [PATCH 46/93] [mmalrenderer] Add sharpness control
-
----
- addons/resource.language.en_gb/resources/strings.po |  2 +-
- xbmc/cores/VideoRenderers/MMALRenderer.cpp          | 13 ++++++++++++-
- xbmc/cores/VideoRenderers/MMALRenderer.h            |  1 +
- 3 files changed, 14 insertions(+), 2 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index 32314b2..a1da64b 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -8229,7 +8229,7 @@ msgstr ""
- 
- #: xbmc/video/dialogs/GUIDialogVideoSettings.cpp
- msgctxt "#16313"
--msgid "VDPAU - Sharpness"
-+msgid "Sharpness"
- msgstr ""
- 
- #: xbmc/video/dialogs/GUIDialogVideoSettings.cpp
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 2dff194..adf6f73 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -252,6 +252,7 @@ CMMALRenderer::CMMALRenderer()
-   m_bMMALConfigured = false;
-   m_iYV12RenderBuffer = 0;
-   m_inflight = 0;
-+  m_sharpness = -2.0f;
- }
- 
- CMMALRenderer::~CMMALRenderer()
-@@ -459,6 +460,15 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-   if (m_RenderUpdateCallBackFn)
-     (*m_RenderUpdateCallBackFn)(m_RenderUpdateCallBackCtx, m_sourceRect, m_destRect);
- 
-+  // if sharpness setting has changed, we should update it
-+  if (m_sharpness != CMediaSettings::GetInstance().GetCurrentVideoSettings().m_Sharpness)
-+  {
-+    m_sharpness = CMediaSettings::GetInstance().GetCurrentVideoSettings().m_Sharpness;
-+    char command[80], response[80];
-+    sprintf(command, "scaling_sharpness %d", ((int)(50.0f * (m_sharpness + 1.0f) + 0.5f)));
-+    vc_gencmd(response, sizeof response, command);
-+  }
-+
-   if (m_format == RENDER_FMT_BYPASS)
-   {
- #if defined(MMAL_DEBUG_VERBOSE)
-@@ -669,7 +679,8 @@ bool CMMALRenderer::Supports(ERENDERFEATURE feature)
-       feature == RENDERFEATURE_ZOOM            ||
-       feature == RENDERFEATURE_ROTATION        ||
-       feature == RENDERFEATURE_VERTICAL_SHIFT  ||
--      feature == RENDERFEATURE_PIXEL_RATIO)
-+      feature == RENDERFEATURE_PIXEL_RATIO     ||
-+      feature == RENDERFEATURE_SHARPNESS)
-     return true;
- 
-   return false;
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.h b/xbmc/cores/VideoRenderers/MMALRenderer.h
-index d3e5129..a71e645 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.h
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.h
-@@ -119,6 +119,7 @@ protected:
-   RENDER_STEREO_MODE        m_display_stereo_mode;
-   bool                      m_StereoInvert;
-   int                       m_inflight;
-+  float                     m_sharpness;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_vout;
-
-From 2ce900e2ef03fae1215700b5a839276585a00c92 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 24 Apr 2015 13:49:51 +0100
-Subject: [PATCH 47/93] [dvdplayer] Add back required include
-
----
- xbmc/cores/dvdplayer/DVDPlayerVideo.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-index b5777a1..64b4d60 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerVideo.cpp
-@@ -33,6 +33,7 @@
- #include "DVDCodecs/Video/DVDVideoPPFFmpeg.h"
- #include "DVDCodecs/Video/DVDVideoCodecFFmpeg.h"
- #include "DVDDemuxers/DVDDemux.h"
-+#include "DVDDemuxers/DVDDemuxUtils.h"
- #include "DVDOverlayRenderer.h"
- #include "guilib/GraphicContext.h"
- #include <sstream>
-
-From dc5e83b0cfbec04a34b3b8ea7fca8bbbcaae1f2c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 48/93] [screensaver] Leave GUI contents available for
- screensaver
-
----
- xbmc/guilib/GUIWindowManager.cpp | 11 ++++++++++-
- 1 file changed, 10 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/guilib/GUIWindowManager.cpp b/xbmc/guilib/GUIWindowManager.cpp
-index 89cfb8b..767c7b6 100644
---- a/xbmc/guilib/GUIWindowManager.cpp
-+++ b/xbmc/guilib/GUIWindowManager.cpp
-@@ -786,7 +786,16 @@ void CGUIWindowManager::ActivateWindow_Internal(int iWindowID, const std::vector
-   int currentWindow = GetActiveWindow();
-   CGUIWindow *pWindow = GetWindow(currentWindow);
-   if (pWindow)
--    CloseWindowSync(pWindow, iWindowID);
-+  {
-+    if (iWindowID == WINDOW_SCREENSAVER)
-+    {
-+      pWindow->Close(true, iWindowID);
-+    }
-+    else
-+    {
-+      CloseWindowSync(pWindow, iWindowID);
-+    }
-+  }
-   g_infoManager.SetNextWindow(WINDOW_INVALID);
- 
-   // Add window to the history list (we must do this before we activate it,
-
-From 249d8d5147b3124129255deaa216da316cb8732e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 49/93] ffmpeg: Automatic switch to software decode for GMC
- with more than one warp point
-
----
- ...Signal-unsupported-GMC-with-more-than-one.patch | 48 ++++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |  4 +-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp |  6 +++
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h        |  2 +
- .../cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp |  2 +-
- xbmc/cores/dvdplayer/DVDStreamInfo.cpp             |  3 ++
- xbmc/cores/dvdplayer/DVDStreamInfo.h               |  1 +
- xbmc/cores/omxplayer/OMXHelper.cpp                 |  8 +++-
- 8 files changed, 71 insertions(+), 3 deletions(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-new file mode 100644
-index 0000000..4cb8dd8
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-@@ -0,0 +1,48 @@
-+From 84e9a1784bbd3182b68cefa5e5feae8da8b9e184 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Fri, 5 Jun 2015 22:48:33 +0100
-+Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
-+ point
-+
-+---
-+ libavcodec/avcodec.h       | 1 +
-+ libavcodec/mpeg4videodec.c | 4 ++++
-+ 2 files changed, 5 insertions(+)
-+
-+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-+index 8c7c420..e63dc2d 100644
-+--- a/libavcodec/avcodec.h
-++++ b/libavcodec/avcodec.h
-+@@ -2527,6 +2527,7 @@ typedef struct AVCodecContext {
-+ #define FF_BUG_DC_CLIP          4096
-+ #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
-+ #define FF_BUG_TRUNCATED       16384
-++#define FF_BUG_GMC_UNSUPPORTED 32768
-+ 
-+     /**
-+      * strictly follow the standard (MPEG4, ...).
-+diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-+index 9bf33dd..0b5d3b9 100644
-+--- a/libavcodec/mpeg4videodec.c
-++++ b/libavcodec/mpeg4videodec.c
-+@@ -2179,6 +2179,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-+ 
-+         if (ctx->divx_version >= 0)
-+             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
-++
-++        if (ctx->num_sprite_warping_points > 1)
-++            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
-+     }
-+ 
-+     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-+@@ -2203,6 +2206,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-+                s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-+                ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-+ 
-++    avctx->workaround_bugs = s->workaround_bugs;
-+     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
-+         s->codec_id == AV_CODEC_ID_MPEG4 &&
-+         avctx->idct_algo == FF_IDCT_AUTO) {
-+-- 
-+1.9.1
-+
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index fcfc553..6a9f105 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,6 +1,7 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
--DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch
-+DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-+  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -71,6 +72,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index c09074d..3345685 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -46,6 +46,10 @@
- 
- #include "linux/RBP.h"
- 
-+#ifndef FF_BUG_GMC_UNSUPPORTED
-+#define FF_BUG_GMC_UNSUPPORTED 0
-+#endif
-+
- using namespace KODI::MESSAGING;
- 
- #define CLASSNAME "CMMALVideoBuffer"
-@@ -531,6 +535,8 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   // we always qualify even if DVDFactoryCodec does this too.
-   if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
-     return false;
-+  if (hints.workaround_bugs & FF_BUG_GMC_UNSUPPORTED)
-+    return false;
- 
-   m_hints = hints;
-   m_vout_input_pool = (MMAL_POOL_T *)options.m_opaque_pointer;
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-index e380056..122e539 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-@@ -160,6 +160,7 @@ public:
-     type = STREAM_VIDEO;
-     iOrientation = 0;
-     iBitsPerPixel = 0;
-+    workaround_bugs = 0;
-   }
- 
-   virtual ~CDemuxStreamVideo() {}
-@@ -176,6 +177,7 @@ public:
-   int iOrientation; // orientation of the video in degress counter clockwise
-   int iBitsPerPixel;
-   std::string stereo_mode; // expected stereo mode
-+  int workaround_bugs; // info for decoder
- };
- 
- class CDemuxStreamAudio : public CDemuxStream
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-index 47c15b9..56dcbfb 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-@@ -1215,7 +1215,7 @@ CDemuxStream* CDVDDemuxFFmpeg::AddStream(int iId)
-         if (!stereoMode.empty())
-           st->stereo_mode = stereoMode;
- 
--        
-+        st->workaround_bugs = pStream->codec->workaround_bugs;
-         if ( m_pInput->IsStreamType(DVDSTREAM_TYPE_DVD) )
-         {
-           if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-index fa0defa..37c2d16 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-@@ -73,6 +73,7 @@ void CDVDStreamInfo::Clear()
-   bitspersample = 0;
- 
-   orientation = 0;
-+  workaround_bugs = 0;
- }
- 
- bool CDVDStreamInfo::Equal(const CDVDStreamInfo& right, bool withextradata)
-@@ -174,6 +175,7 @@ void CDVDStreamInfo::Assign(const CDVDStreamInfo& right, bool withextradata)
-   vfr = right.vfr;
-   software = right.software;
-   stereo_mode = right.stereo_mode;
-+  workaround_bugs = right.workaround_bugs;
- 
-   // AUDIO
-   channels      = right.channels;
-@@ -231,6 +233,7 @@ void CDVDStreamInfo::Assign(const CDemuxStream& right, bool withextradata)
-     bitsperpixel = stream->iBitsPerPixel;
-     pid = stream->iPhysicalId;
-     stereo_mode = stream->stereo_mode;
-+    workaround_bugs = stream->workaround_bugs;
-   }
-   else if(  right.type == STREAM_SUBTITLE )
-   {
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.h b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-index c0e22a2..3849993 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.h
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-@@ -73,6 +73,7 @@ public:
-   int bitsperpixel;
-   int pid;
-   std::string stereo_mode; // stereoscopic 3d mode
-+  int workaround_bugs; // info for decoder
- 
-   // AUDIO
-   int channels;
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index 7251fc1..3429cea 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -29,6 +29,10 @@
- #include "cores/omxplayer/OMXPlayerAudio.h"
- #include "cores/omxplayer/OMXPlayerVideo.h"
- 
-+#ifndef FF_BUG_GMC_UNSUPPORTED
-+#define FF_BUG_GMC_UNSUPPORTED 0
-+#endif
-+
- #define PREDICATE_RETURN(lh, rh) \
-   do { \
-     if((lh) != (rh)) \
-@@ -80,7 +84,9 @@ bool OMXPlayerUnsuitable(bool m_HasVideo, bool m_HasAudio, CDVDDemux* m_pDemuxer
-       CDVDStreamInfo hint(*stream, true);
- 
-       bool supported = false;
--      if ((hint.codec == AV_CODEC_ID_MPEG1VIDEO || hint.codec == AV_CODEC_ID_MPEG2VIDEO) && g_RBP.GetCodecMpg2())
-+      if (hint.workaround_bugs & FF_BUG_GMC_UNSUPPORTED)
-+        ;
-+      else if ((hint.codec == AV_CODEC_ID_MPEG1VIDEO || hint.codec == AV_CODEC_ID_MPEG2VIDEO) && g_RBP.GetCodecMpg2())
-         supported = true;
-       else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
-         supported = true;
-
-From eb80abbf4ad8994a28d58ea8494e8a7bcd48b2f3 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 50/93] [rbp] Use default resampling setting on Pi2
-
----
- system/settings/rbp2.xml | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/system/settings/rbp2.xml b/system/settings/rbp2.xml
-index 8cc8f19..52778ec 100644
---- a/system/settings/rbp2.xml
-+++ b/system/settings/rbp2.xml
-@@ -23,6 +23,11 @@
-         <setting id="audiooutput.ac3transcode" help="36429">
-         </setting>
-       </group>
-+      <group id="1">
-+        <setting id="audiooutput.processquality">
-+          <default>30</default> <!-- AE_QUALITY_MID -->
-+        </setting>
-+      </group>
-     </category>
-   </section>
- </settings>
-
-From 822ce9d64325082d7b071b68331c8fbd406d2ee1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 10 Mar 2016 17:54:59 +0000
-Subject: [PATCH 51/93] rbp: Expose gpu memory allocation functions
-
----
- xbmc/linux/RBP.cpp         | 116 ++++++++++++
- xbmc/linux/RBP.h           |  16 ++
- xbmc/linux/rpi_user_vcsm.h | 460 +++++++++++++++++++++++++++++++++++++++++++++
- 3 files changed, 592 insertions(+)
- create mode 100644 xbmc/linux/rpi_user_vcsm.h
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index a79d6d9..257c238 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -28,6 +28,17 @@
- 
- #include "cores/omxplayer/OMXImage.h"
- 
-+#include <sys/ioctl.h>
-+#include <linux/ioctl.h>
-+#include "rpi_user_vcsm.h"
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+static int mbox_open();
-+static void mbox_close(int file_desc);
-+
- CRBP::CRBP()
- {
-   m_initialized     = false;
-@@ -36,6 +47,8 @@ CRBP::CRBP()
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-   m_last_pll_adjust = 1.0;
-+  m_mb = mbox_open();
-+  vcsm_init();
- }
- 
- CRBP::~CRBP()
-@@ -225,6 +238,10 @@ void CRBP::Deinitialize()
-   m_omx_image_init  = false;
-   m_initialized     = false;
-   m_omx_initialized = false;
-+  if (m_mb)
-+    mbox_close(m_mb);
-+  m_mb = 0;
-+  vcsm_exit();
- }
- 
- double CRBP::AdjustHDMIClock(double adjust)
-@@ -238,4 +255,103 @@ double CRBP::AdjustHDMIClock(double adjust)
-   return m_last_pll_adjust;
- }
- 
-+static int mbox_property(int file_desc, void *buf)
-+{
-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+   if (ret_val < 0)
-+   {
-+     CLog::Log(LOGERROR, "%s: ioctl_set_msg failed:%d", __FUNCTION__, ret_val);
-+   }
-+   return ret_val;
-+}
-+
-+static int mbox_open()
-+{
-+   int file_desc;
-+
-+   // open a char device file used for communicating with kernel mbox driver
-+   file_desc = open(DEVICE_FILE_NAME, 0);
-+   if (file_desc < 0)
-+   {
-+     CLog::Log(LOGERROR, "%s: Can't open device file: %s (%d)", __FUNCTION__, DEVICE_FILE_NAME, file_desc);
-+     CLog::Log(LOGERROR, "Try creating a device file with: sudo mknod %s c %d 0", __FUNCTION__, DEVICE_FILE_NAME, MAJOR_NUM);
-+   }
-+   return file_desc;
-+}
-+
-+static void mbox_close(int file_desc)
-+{
-+  close(file_desc);
-+}
-+
-+static unsigned mem_lock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000d; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+unsigned mem_unlock(int file_desc, unsigned handle)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+
-+   p[i++] = 0x3000e; // (the tag id)
-+   p[i++] = 4; // (size of the buffer)
-+   p[i++] = 4; // (size of the data)
-+   p[i++] = handle;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
-+CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
-+{
-+  m_numbytes = numbytes;
-+  m_vcsm_handle = vcsm_malloc_cache(numbytes, cached ? VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE, (char *)"CGPUMEM");
-+  assert(m_vcsm_handle);
-+  m_vc_handle = vcsm_vc_hdl_from_hdl(m_vcsm_handle);
-+  assert(m_vc_handle);
-+  m_arm = vcsm_lock(m_vcsm_handle);
-+  assert(m_arm);
-+  m_vc = mem_lock(g_RBP.GetMBox(), m_vc_handle);
-+  assert(m_vc);
-+}
-+
-+CGPUMEM::~CGPUMEM()
-+{
-+  mem_unlock(g_RBP.GetMBox(), m_vc_handle);
-+  vcsm_unlock_ptr(m_arm);
-+  vcsm_free(m_vcsm_handle);
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void CGPUMEM::Flush()
-+{
-+  struct vcsm_user_clean_invalid_s iocache = {};
-+  iocache.s[0].handle = m_vcsm_handle;
-+  iocache.s[0].cmd = 3; // clean+invalidate
-+  iocache.s[0].addr = (int) m_arm;
-+  iocache.s[0].size  = m_numbytes;
-+  vcsm_clean_invalid( &iocache );
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index db2fade..ab24bbe 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -41,6 +41,20 @@
- #include "threads/CriticalSection.h"
- #include "threads/Event.h"
- 
-+class CGPUMEM
-+{
-+public:
-+  CGPUMEM(unsigned int numbytes, bool cached = true);
-+  ~CGPUMEM();
-+  void Flush();
-+  void *m_arm; // Pointer to memory mapped on ARM side
-+  int m_vc_handle;   // Videocore handle of relocatable memory
-+  int m_vcsm_handle; // Handle for use by VCSM
-+  unsigned int m_vc;       // Address for use in GPU code
-+  unsigned int m_numbytes; // Size of memory block
-+  void *m_opaque;
-+};
-+
- class CRBP
- {
- public:
-@@ -66,6 +80,7 @@ public:
-   void WaitVsync();
-   double AdjustHDMIClock(double adjust);
-   double GetAdjustHDMIClock() { return m_last_pll_adjust; }
-+  int GetMBox() { return m_mb; }
- 
- private:
-   DllBcmHost *m_DllBcmHost;
-@@ -83,6 +98,7 @@ private:
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-+  int m_mb;
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/linux/rpi_user_vcsm.h b/xbmc/linux/rpi_user_vcsm.h
-new file mode 100644
-index 0000000..94e6e79
---- /dev/null
-+++ b/xbmc/linux/rpi_user_vcsm.h
-@@ -0,0 +1,460 @@
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
-+*    all reasonable efforts to protect the confidentiality thereof, and to
-+*    use this information only in connection with your use of Broadcom
-+*    integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+**       1) Initialize the service.
-+**       2) Allocate shared memory blocks.
-+**       3) Start using the allocated blocks.
-+**          - In order to gain ownership on a block, lock the allocated block,
-+**            locking a block returns a valid address that the user application
-+**            can access.
-+**          - When finished with using the block for the current execution cycle
-+**            or function, and so when giving up the ownership, unlock the block.
-+**       4) A block can be locked/unlocked as many times required - within or outside
-+**          of - a specific execution context.
-+**       5) To completely release an allocated block, free it.
-+**       6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+**   Memory blocks can be allocated in different manners depending on the cache
-+**   behavior desired.  A given block can either be:
-+
-+**       - Allocated in a non cached fashion all the way through host and videocore.
-+**       - Allocated in a cached fashion on host OR videocore.
-+**       - Allocated in a cached fashion on host AND videocore.
-+**
-+**   It is an application decision to determine how to allocate a block.  Evidently
-+**   if the application will be doing substantial read/write accesses to a given block,
-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
-+**   better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, locking the
-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**   It is possible to dynamically change the host cache behavior (ie cached or non
-+**   cached) of a given allocation without needing to free and re-allocate the block.
-+**   This feature can be useful for such application which requires access to the block
-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-+**   the application can optimize performances for a given duration of use.
-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-+**   cache.  If one requires to change the videocore cache behavior, then a new block
-+**   must be created to replace the old one.
-+**
-+**   On successful locking, a valid pointer is returned that the application can use
-+**   to access to data inside the block.  There is no guarantee that the pointer will
-+**   stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+**   When the memory block has been allocated in a host cached fashion, unlocking the
-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-+**   explicitely asked not to flush the cache for performances reasons.
-+**
-+**   For the above reason and when using host cached allocation, it is important that
-+**   an application properly implements the lock/unlock mechanism to ensure cache will
-+**   stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
-+
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
-+
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-+                                    // Result of the walk is seen in the videocore
-+                                    // log.
-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-+                                    // driver (ie for all processes).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+                                    // driver (for current process).  Result of
-+                                    // the walk is seen in the kernel log.
-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+                                    // VCSM_STATUS_HOST_WALK_MAP.
-+                                    //
-+   VCSM_STATUS_NONE,                // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
-+
-+
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
-+
-+
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+** 
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+** 
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed.  If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process.  In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer.  The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+**       opaque handle (allocated via vcsm_malloc) and it is only
-+**       significant for such application which knows what to do
-+**       with it, for the others it is just a number with little
-+**       use since nothing can be done with it (in particular
-+**       for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-+
-+
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns:        0 on error
-+**                 a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns:        0 on error
-+**                 a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.  The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired.  The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use.  Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL.  The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns:        NULL on error
-+**                 a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+                       VCSM_CACHE_TYPE_T cache_update,
-+                       VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+**    Do not flush cache as the result of the unlock (if cache
-+**    flush was otherwise applicable in this case).
-+**
-+** Returns:        0 on success
-+**                 -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns:        non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate       given virtual range in L1/L2
-+** 2: clean            given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
-+*/
-+struct vcsm_user_clean_invalid_s {
-+   struct {
-+      unsigned int cmd;
-+      unsigned int handle;
-+      unsigned int addr;
-+      unsigned int size;
-+   } s[8];
-+};
-+
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* __USER_VCSM__H__INCLUDED__ */
-+
-
-From 97b436ca545f9a2faad6fdf02a9668843bd3f324 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 10 Mar 2016 17:56:11 +0000
-Subject: [PATCH 52/93] [rbp] HW mouse pointer
-
-Updating the mouse point provokes a complete screen update which can make it feel laggy
-and results in high cpu.
-
-Render the mouse with an overlay to avoid redrawing the normal gui.
----
- xbmc/guilib/GUIWindowManager.cpp  |   2 +
- xbmc/linux/RBP.cpp                | 137 ++++++++++++++++++++++++++++++++++++++
- xbmc/linux/RBP.h                  |   9 +++
- xbmc/windowing/WinEventsLinux.cpp | 125 ++++++++++++++++++++++++++++++++++
- xbmc/windowing/WinEventsLinux.h   |  11 +++
- 5 files changed, 284 insertions(+)
-
-diff --git a/xbmc/guilib/GUIWindowManager.cpp b/xbmc/guilib/GUIWindowManager.cpp
-index 767c7b6..58a196a 100644
---- a/xbmc/guilib/GUIWindowManager.cpp
-+++ b/xbmc/guilib/GUIWindowManager.cpp
-@@ -192,7 +192,9 @@ void CGUIWindowManager::CreateWindows()
-   Add(new CGUIWindowAddonBrowser);
-   Add(new CGUIWindowScreensaverDim);
-   Add(new CGUIWindowDebugInfo);
-+#ifndef TARGET_RASPBERRY_PI
-   Add(new CGUIWindowPointer);
-+#endif
-   Add(new CGUIDialogYesNo);
-   Add(new CGUIDialogProgress);
-   Add(new CGUIDialogExtendedProgressBar);
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 257c238..13b0504 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -28,6 +28,9 @@
- 
- #include "cores/omxplayer/OMXImage.h"
- 
-+#include "guilib/GraphicContext.h"
-+#include "settings/DisplaySettings.h"
-+
- #include <sys/ioctl.h>
- #include <linux/ioctl.h>
- #include "rpi_user_vcsm.h"
-@@ -47,6 +50,10 @@ CRBP::CRBP()
-   m_OMX             = new COMXCore();
-   m_display = DISPMANX_NO_HANDLE;
-   m_last_pll_adjust = 1.0;
-+  m_p = NULL;
-+  m_x = 0;
-+  m_y = 0;
-+  m_enabled = 0;
-   m_mb = mbox_open();
-   vcsm_init();
- }
-@@ -129,7 +136,10 @@ void CRBP::LogFirmwareVerison()
- DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- {
-   if (m_display == DISPMANX_NO_HANDLE)
-+  {
-     m_display = vc_dispmanx_display_open( 0 /*screen*/ );
-+    init_cursor();
-+  }
-   return m_display;
- }
- 
-@@ -138,6 +148,7 @@ void CRBP::CloseDisplay(DISPMANX_DISPLAY_HANDLE_T display)
-   assert(display == m_display);
-   vc_dispmanx_display_close(m_display);
-   m_display = DISPMANX_NO_HANDLE;
-+  uninit_cursor();
- }
- 
- void CRBP::GetDisplaySize(int &width, int &height)
-@@ -238,6 +249,9 @@ void CRBP::Deinitialize()
-   m_omx_image_init  = false;
-   m_initialized     = false;
-   m_omx_initialized = false;
-+  uninit_cursor();
-+  delete m_p;
-+  m_p = NULL;
-   if (m_mb)
-     mbox_close(m_mb);
-   m_mb = 0;
-@@ -323,6 +337,52 @@ unsigned mem_unlock(int file_desc, unsigned handle)
-    return p[5];
- }
- 
-+unsigned int mailbox_set_cursor_info(int file_desc, int width, int height, int format, uint32_t buffer, int hotspotx, int hotspoty)
-+{
-+   int i=0;
-+   unsigned int p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x00008010; // set cursor state
-+   p[i++] = 24; // buffer size
-+   p[i++] = 24; // data size
-+
-+   p[i++] = width;
-+   p[i++] = height;
-+   p[i++] = format;
-+   p[i++] = buffer;           // ptr to VC memory buffer. Doesn't work in 64bit....
-+   p[i++] = hotspotx;
-+   p[i++] = hotspoty;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof(*p); // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+
-+}
-+
-+unsigned int mailbox_set_cursor_position(int file_desc, int enabled, int x, int y)
-+{
-+   int i=0;
-+   unsigned p[32];
-+   p[i++] = 0; // size
-+   p[i++] = 0x00000000; // process request
-+   p[i++] = 0x00008011; // set cursor state
-+   p[i++] = 12; // buffer size
-+   p[i++] = 12; // data size
-+
-+   p[i++] = enabled;
-+   p[i++] = x;
-+   p[i++] = y;
-+
-+   p[i++] = 0x00000000; // end tag
-+   p[0] = i*sizeof *p; // actual size
-+
-+   mbox_property(file_desc, p);
-+   return p[5];
-+}
-+
- CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
- {
-   m_numbytes = numbytes;
-@@ -354,4 +414,81 @@ void CGPUMEM::Flush()
-   vcsm_clean_invalid( &iocache );
- }
- 
-+#define T 0
-+#define W 0xffffffff
-+#define B 0xff000000
-+
-+const static uint32_t default_cursor_pixels[] =
-+{
-+   B,B,B,B,B,B,B,B,B,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,W,B,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,B,T,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,B,T,T,T,T,T,T,T,T,T,T,
-+   B,W,W,W,W,W,B,T,T,T,T,T,T,T,T,T,
-+   B,W,W,B,W,W,W,B,T,T,T,T,T,T,T,T,
-+   B,W,B,T,B,W,W,W,B,T,T,T,T,T,T,T,
-+   B,B,T,T,T,B,W,W,W,B,T,T,T,T,T,T,
-+   B,T,T,T,T,T,B,W,W,W,B,T,T,T,T,T,
-+   T,T,T,T,T,T,T,B,W,W,W,B,T,T,T,T,
-+   T,T,T,T,T,T,T,T,B,W,W,W,B,T,T,T,
-+   T,T,T,T,T,T,T,T,T,B,W,W,W,B,T,T,
-+   T,T,T,T,T,T,T,T,T,T,B,W,W,W,B,T,
-+   T,T,T,T,T,T,T,T,T,T,T,B,W,W,W,B,
-+   T,T,T,T,T,T,T,T,T,T,T,T,B,W,B,T,
-+   T,T,T,T,T,T,T,T,T,T,T,T,T,B,T,T
-+};
-+
-+#undef T
-+#undef W
-+#undef B
-+
-+void CRBP::init_cursor()
-+{
-+  if (!m_mb)
-+    return;
-+  if (!m_p)
-+    m_p = new CGPUMEM(64 * 64 * 4, false);
-+  if (m_p && m_p->m_arm && m_p->m_vc)
-+    set_cursor(default_cursor_pixels, 16, 16, 0, 0);
-+}
-+
-+void CRBP::set_cursor(const void *pixels, int width, int height, int hotspot_x, int hotspot_y)
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc || !pixels || width * height > 64 * 64)
-+    return;
-+  memcpy(m_p->m_arm, pixels, width * height * 4);
-+  unsigned int s = mailbox_set_cursor_info(m_mb, width, height, 0, m_p->m_vc, hotspot_x, hotspot_y);
-+  assert(s == 0);
-+}
-+
-+void CRBP::update_cursor(int x, int y, bool enabled)
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc)
-+    return;
-+
-+  RESOLUTION res = g_graphicsContext.GetVideoResolution();
-+  CRect gui(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iHeight);
-+  CRect display(0, 0, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenWidth, CDisplaySettings::GetInstance().GetResolutionInfo(res).iScreenHeight);
-+
-+  int x2 = x * display.Width()  / gui.Width();
-+  int y2 = y * display.Height() / gui.Height();
-+
-+  if (g_graphicsContext.GetStereoMode() == RENDER_STEREO_MODE_SPLIT_HORIZONTAL)
-+    y2 *= 2;
-+  else if (g_graphicsContext.GetStereoMode() == RENDER_STEREO_MODE_SPLIT_VERTICAL)
-+    x2 *= 2;
-+  if (m_x != x2 || m_y != y2 || m_enabled != enabled)
-+    mailbox_set_cursor_position(m_mb, enabled, x2, y2);
-+  m_x = x2;
-+  m_y = y2;
-+  m_enabled = enabled;
-+}
-+
-+void CRBP::uninit_cursor()
-+{
-+  if (!m_mb || !m_p || !m_p->m_arm || !m_p->m_vc)
-+    return;
-+  mailbox_set_cursor_position(m_mb, 0, 0, 0);
-+}
-+
- #endif
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index ab24bbe..2eee35d 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -99,6 +99,15 @@ private:
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-   int m_mb;
-+  CGPUMEM *m_p;
-+  int m_x;
-+  int m_y;
-+  bool m_enabled;
-+  public:
-+  void init_cursor();
-+  void set_cursor(const void *pixels, int width, int height, int hotspot_x, int hotspot_y);
-+  void update_cursor(int x, int y, bool enabled);
-+  void uninit_cursor();
- };
- 
- extern CRBP g_RBP;
-diff --git a/xbmc/windowing/WinEventsLinux.cpp b/xbmc/windowing/WinEventsLinux.cpp
-index aaba119..2b3d77a 100644
---- a/xbmc/windowing/WinEventsLinux.cpp
-+++ b/xbmc/windowing/WinEventsLinux.cpp
-@@ -30,11 +30,26 @@
- #include "utils/log.h"
- #include "powermanagement/PowerManager.h"
- 
-+#ifdef TARGET_RASPBERRY_PI
-+#include "utils/TimeUtils.h"
-+#include "guilib/Resolution.h"
-+#include "addons/Skin.h"
-+#include "utils/XMLUtils.h"
-+#include "utils/StringUtils.h"
-+#include "guilib/Texture.h"
-+#include "linux/RBP.h"
-+#include "input/InputManager.h"
-+#endif
-+
- bool CWinEventsLinux::m_initialized = false;
- CLinuxInputDevices CWinEventsLinux::m_devices;
- 
- CWinEventsLinux::CWinEventsLinux()
- {
-+#ifdef TARGET_RASPBERRY_PI
-+  m_last_mouse_move_time = 0;
-+  m_mouse_state = -1;
-+#endif
- }
- 
- void CWinEventsLinux::RefreshDevices()
-@@ -48,19 +63,122 @@ bool CWinEventsLinux::IsRemoteLowBattery()
-   return false;
- }
- 
-+#ifdef TARGET_RASPBERRY_PI
-+bool CWinEventsLinux::LoadXML(const std::string strFileName)
-+{
-+  RESOLUTION_INFO m_coordsRes; // resolution that the window coordinates are in.
-+  // Find appropriate skin folder + resolution to load from
-+  std::string strFileNameLower = strFileName;
-+  StringUtils::ToLower(strFileNameLower);
-+  std::string strLowerPath = g_SkinInfo->GetSkinPath(strFileNameLower, &m_coordsRes);
-+  std::string strPath = g_SkinInfo->GetSkinPath(strFileName, &m_coordsRes);
-+
-+  TiXmlElement* pRootElement = NULL;
-+  CXBMCTinyXML xmlDoc;
-+  std::string strPathLower = strPath;
-+  StringUtils::ToLower(strPathLower);
-+  if (!xmlDoc.LoadFile(strPath) && !xmlDoc.LoadFile(strPathLower) && !xmlDoc.LoadFile(strLowerPath))
-+  {
-+    CLog::Log(LOGERROR, "unable to load:%s, Line %d\n%s", strPath.c_str(), xmlDoc.ErrorRow(), xmlDoc.ErrorDesc());
-+    return false;
-+  }
-+  pRootElement = (TiXmlElement*)xmlDoc.RootElement()->Clone();
-+
-+  if (!pRootElement)
-+    return false;
-+
-+  if (strcmpi(pRootElement->Value(), "window"))
-+  {
-+    CLog::Log(LOGERROR, "file : XML file doesnt contain <window>");
-+    return false;
-+  }
-+
-+  TiXmlElement *pChild = pRootElement->FirstChildElement();
-+  while (pChild)
-+  {
-+    if (strcmpi(pChild->Value(), "controls") == 0)
-+    {
-+      TiXmlElement *pControl = pChild->FirstChildElement();
-+      while (pControl)
-+      {
-+        if (strcmpi(pControl->Value(), "control") == 0)
-+        {
-+          std::string strStringValue;
-+          if (XMLUtils::GetString(pControl, "texture", strStringValue))
-+          {
-+            const char* idAttr = pControl->Attribute("id");
-+            int index = idAttr ? atoi(idAttr)-1 : -1;
-+            if (index >= 0 && index < (int)(sizeof m_cursors/sizeof *m_cursors))
-+            {
-+              if (m_cursors[index].m_filename.size())
-+                g_TextureManager.ReleaseTexture(m_cursors[index].m_filename, true);
-+              m_cursors[index].m_filename.clear();
-+              m_cursors[index].m_texture = g_TextureManager.Load(strStringValue);
-+              if (m_cursors[index].m_texture.size())
-+                m_cursors[index].m_filename = strStringValue;
-+            }
-+          }
-+        }
-+        pControl = pControl->NextSiblingElement();
-+      }
-+    }
-+    pChild = pChild->NextSiblingElement();
-+  }
-+  delete pRootElement;
-+  return true;
-+}
-+#endif
-+
- bool CWinEventsLinux::MessagePump()
- {
-   if (!m_initialized)
-   {
-     m_devices.InitAvailable();
-     m_initialized = true;
-+#ifdef TARGET_RASPBERRY_PI
-+    LoadXML("Pointer.xml");
-+#endif
-   }
- 
-   bool ret = false;
-   XBMC_Event event = {0};
-+#ifdef TARGET_RASPBERRY_PI
-+  bool active = CInputManager::GetInstance().IsMouseActive();
-+  int64_t Now = CurrentHostCounter();
-+  if (!active)
-+  {
-+    if (m_mouse_state != -1)
-+    {
-+      g_RBP.update_cursor(0, 0, 0);
-+      m_mouse_state = -1;
-+    }
-+  }
-+  else
-+  {
-+    int state = CInputManager::GetInstance().GetMouseState() - 1;
-+    if (m_mouse_state != state)
-+    {
-+      if (state >= 0 && state < (int)(sizeof m_cursors/sizeof *m_cursors))
-+      {
-+        CBaseTexture *t = (m_cursors[state].m_texture.m_textures)[0];
-+        if (t)
-+          g_RBP.set_cursor((const void *)t->GetPixels(), t->GetPitch()>>2, t->GetRows(), 0, 0);
-+      }
-+      m_mouse_state = state;
-+    }
-+  }
-+#endif
-   while (1)
-   {
-     event = m_devices.ReadEvent();
-+#ifdef TARGET_RASPBERRY_PI
-+    if (active && (event.type == XBMC_MOUSEMOTION || event.type == XBMC_MOUSEBUTTONDOWN || event.type == XBMC_MOUSEBUTTONUP))
-+    {
-+      if (event.type == XBMC_MOUSEMOTION)
-+        g_RBP.update_cursor(event.motion.x, event.motion.y, 1);
-+      m_last_mouse_move_time = Now;
-+    }
-+#endif
-     if (event.type != XBMC_NOEVENT)
-     {
-       ret |= g_application.OnEvent(event);
-@@ -71,6 +189,13 @@ bool CWinEventsLinux::MessagePump()
-     }
-   }
- 
-+#ifdef TARGET_RASPBERRY_PI
-+  if (active && Now - m_last_mouse_move_time > 5 * 1000000000LL)
-+  {
-+    g_RBP.update_cursor(0, 0, 0);
-+    m_mouse_state = -1;
-+  }
-+#endif
-   return ret;
- }
- 
-diff --git a/xbmc/windowing/WinEventsLinux.h b/xbmc/windowing/WinEventsLinux.h
-index a17e987..23244a2 100644
---- a/xbmc/windowing/WinEventsLinux.h
-+++ b/xbmc/windowing/WinEventsLinux.h
-@@ -24,6 +24,7 @@
- #pragma once
- #include "windowing/WinEvents.h"
- #include "input/linux/LinuxInputDevices.h"
-+#include "guilib/TextureManager.h"
- 
- class CWinEventsLinux : public IWinEvents
- {
-@@ -43,6 +44,16 @@ public:
- private:
-   static bool m_initialized;
-   static CLinuxInputDevices m_devices;
-+#ifdef TARGET_RASPBERRY_PI
-+  bool LoadXML(const std::string strFileName);
-+  int64_t m_last_mouse_move_time;
-+  struct
-+  {
-+    std::string m_filename;
-+    CTextureArray m_texture;
-+  } m_cursors[4];
-+  int m_mouse_state;
-+#endif
- };
- 
- #endif
-
-From fbd04377a1dac080166e1e4baa2250f402e3b66f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 2 Aug 2014 17:48:04 +0100
-Subject: [PATCH 53/93] [omx] Report decoded image name
-
----
- xbmc/cores/omxplayer/OMXImage.cpp | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/xbmc/cores/omxplayer/OMXImage.cpp b/xbmc/cores/omxplayer/OMXImage.cpp
-index 1524214..a01c435 100644
---- a/xbmc/cores/omxplayer/OMXImage.cpp
-+++ b/xbmc/cores/omxplayer/OMXImage.cpp
-@@ -327,6 +327,7 @@ bool COMXImage::DecodeJpegToTexture(COMXImageFile *file, unsigned int width, uns
-   {
-     ret = true;
-     *userdata = tex;
-+    CLog::Log(LOGDEBUG, "%s: decoded %s %dx%d", __func__, file->GetFilename(), width, height);
-   }
-   else
-   {
-
-From 99d06dd14a4501fe81b36e8ce3966dc99cd04b94 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 11 Apr 2014 16:12:27 +0100
-Subject: [PATCH 54/93] [omxplayer] Add ability to log more timestamp info in
- extra debug settings
-
----
- addons/resource.language.en_gb/resources/strings.po |  5 +++++
- xbmc/commons/ilog.h                                 |  1 +
- xbmc/cores/omxplayer/OMXHelper.cpp                  | 12 +++++++-----
- xbmc/cores/omxplayer/OMXPlayerAudio.cpp             |  8 ++++----
- xbmc/cores/omxplayer/OMXPlayerVideo.cpp             |  9 +++++----
- xbmc/settings/AdvancedSettings.cpp                  |  3 +++
- 6 files changed, 25 insertions(+), 13 deletions(-)
-
-diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
-index a1da64b..1fb7988 100644
---- a/addons/resource.language.en_gb/resources/strings.po
-+++ b/addons/resource.language.en_gb/resources/strings.po
-@@ -2989,6 +2989,11 @@ msgctxt "#680"
- msgid "Verbose logging for the [B]Video[/B] component"
- msgstr ""
- 
-+#: xbmc/settings/AdvancedSettings.cpp
-+msgctxt "#697"
-+msgid "Verbose logging for OMXPLAYER"
-+msgstr ""
-+
- #empty strings from id 681 to 699
- 
- msgctxt "#700"
-diff --git a/xbmc/commons/ilog.h b/xbmc/commons/ilog.h
-index de90359..e4ffb5e 100644
---- a/xbmc/commons/ilog.h
-+++ b/xbmc/commons/ilog.h
-@@ -53,6 +53,7 @@
- #define LOGUPNP     (1 << (LOGMASKBIT + 8))
- #define LOGCEC      (1 << (LOGMASKBIT + 9))
- #define LOGVIDEO    (1 << (LOGMASKBIT + 10))
-+#define LOGOMXPLAYER (1 << (LOGMASKBIT + 16))
- 
- #include "utils/params_check_macros.h"
- 
-diff --git a/xbmc/cores/omxplayer/OMXHelper.cpp b/xbmc/cores/omxplayer/OMXHelper.cpp
-index 3429cea..59c3a61 100644
---- a/xbmc/cores/omxplayer/OMXHelper.cpp
-+++ b/xbmc/cores/omxplayer/OMXHelper.cpp
-@@ -23,6 +23,7 @@
- #ifdef HAS_OMXPLAYER
- 
- #include "DVDPlayer.h"
-+#include "settings/AdvancedSettings.h"
- #include "settings/Settings.h"
- #include "settings/MediaSettings.h"
- #include "DVDInputStreams/DVDInputStream.h"
-@@ -155,7 +156,8 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     m_OmxPlayerState.video_fifo = (int)(100.0*(m_dvdPlayerVideo->GetDecoderBufferSize()-m_dvdPlayerVideo->GetDecoderFreeSpace())/m_dvdPlayerVideo->GetDecoderBufferSize());
-     m_OmxPlayerState.audio_fifo = (int)(100.0*audio_fifo/m_dvdPlayerAudio->GetCacheTotal());
- 
--    #ifdef _DEBUG
-+  if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+  {
-     static unsigned count;
-     if ((count++ & 7) == 0)
-     {
-@@ -175,7 +177,7 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-       vc_gencmd(response, sizeof response, "render_bar 7 audio_queue %d %d %d %d",
-             m_dvdPlayerAudio->GetLevel(), 0, 0, 100);
-     }
--    #endif
-+  }
-     if (audio_pts != DVD_NOPTS_VALUE)
-     {
-       audio_fifo_low = m_HasAudio && audio_fifo < threshold;
-@@ -191,15 +193,15 @@ bool OMXDoProcessing(struct SOmxPlayerState &m_OmxPlayerState, int m_playSpeed,
-     if (!m_HasVideo && m_HasAudio)
-       video_fifo_high = true;
- 
--    #ifdef _DEBUG
-+  if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+  {
-     CLog::Log(LOGDEBUG, "%s::%s M:%.6f-%.6f (A:%.6f V:%.6f) PEF:%d%d%d S:%.2f A:%.2f V:%.2f/T:%.2f (A:%d%d V:%d%d) A:%d%% V:%d%% (%.2f,%.2f)", "CDVDPlayer", __FUNCTION__,
-       m_OmxPlayerState.stamp*1e-6, m_OmxPlayerState.av_clock.OMXClockAdjustment()*1e-6, audio_pts*1e-6, video_pts*1e-6,
-       m_OmxPlayerState.av_clock.OMXIsPaused(), m_OmxPlayerState.bOmxSentEOFs, not_accepts_data, m_playSpeed * (1.0f/DVD_PLAYSPEED_NORMAL),
-       audio_pts == DVD_NOPTS_VALUE ? 0.0:audio_fifo, video_pts == DVD_NOPTS_VALUE ? 0.0:video_fifo, m_OmxPlayerState.threshold,
-       audio_fifo_low, audio_fifo_high, video_fifo_low, video_fifo_high,
-       m_dvdPlayerAudio->GetLevel(), m_dvdPlayerVideo->GetLevel(), m_dvdPlayerAudio->GetDelay(), (float)m_dvdPlayerAudio->GetCacheTotal());
--    #endif
--
-+  }
-     if(!m_Pause && (m_OmxPlayerState.bOmxSentEOFs || not_accepts_data || (audio_fifo_high && video_fifo_high) || m_playSpeed != DVD_PLAYSPEED_NORMAL))
-     {
-       if (m_OmxPlayerState.av_clock.OMXIsPaused())
-diff --git a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-index 1c3b904..2056962 100644
---- a/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerAudio.cpp
-@@ -366,10 +366,10 @@ void OMXPlayerAudio::Process()
-       DemuxPacket* pPacket = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacket();
-       bool bPacketDrop     = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacketDrop();
- 
--      #ifdef _DEBUG
--      CLog::Log(LOGINFO, "Audio: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d,%d", pPacket->dts, pPacket->pts,
--           (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, (int)m_omxAudio.GetAudioRenderingLatency(), (int)m_hints_current.samplerate);
--      #endif
-+      if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+        CLog::Log(LOGINFO, "Audio: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d,%d", pPacket->dts, pPacket->pts,
-+             (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, (int)m_omxAudio.GetAudioRenderingLatency(), (int)m_hints_current.samplerate);
-+
-       if(Decode(pPacket, m_speed > DVD_PLAYSPEED_NORMAL || m_speed < 0 || bPacketDrop))
-       {
-         // we are not running until something is cached in output device
-diff --git a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-index 0e04360..7c34e10 100644
---- a/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-+++ b/xbmc/cores/omxplayer/OMXPlayerVideo.cpp
-@@ -42,6 +42,7 @@
- #include "DVDOverlayRenderer.h"
- #include "settings/DisplaySettings.h"
- #include "settings/Settings.h"
-+#include "settings/AdvancedSettings.h"
- #include "settings/MediaSettings.h"
- #include "cores/VideoRenderers/RenderFormats.h"
- #include "cores/VideoRenderers/RenderFlags.h"
-@@ -452,10 +453,10 @@ void OMXPlayerVideo::Process()
-       DemuxPacket* pPacket = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacket();
-       bool bPacketDrop     = ((CDVDMsgDemuxerPacket*)pMsg)->GetPacketDrop();
- 
--      #ifdef _DEBUG
--      CLog::Log(LOGINFO, "Video: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d\n", pPacket->dts, pPacket->pts, 
--          (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, 0);
--      #endif
-+      if (g_advancedSettings.CanLogComponent(LOGOMXPLAYER))
-+        CLog::Log(LOGINFO, "Video: dts:%.0f pts:%.0f size:%d (s:%d f:%d d:%d l:%d) s:%d %d/%d late:%d\n", pPacket->dts, pPacket->pts,
-+            (int)pPacket->iSize, m_started, m_flush, bPacketDrop, m_stalled, m_speed, 0, 0, 0);
-+
-       if (m_messageQueue.GetDataSize() == 0
-       ||  m_speed < 0)
-       {
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 22b8459..8045a03 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -1359,6 +1359,9 @@ void CAdvancedSettings::SettingOptionsLoggingComponentsFiller(const CSetting *se
- #ifdef HAVE_LIBCEC
-   list.push_back(std::make_pair(g_localizeStrings.Get(679), LOGCEC));
- #endif
-+#ifdef TARGET_RASPBERRY_PI
-+  list.push_back(std::make_pair(g_localizeStrings.Get(697), LOGOMXPLAYER));
-+#endif
- }
- 
- void CAdvancedSettings::setExtraLogLevel(const std::vector<CVariant> &components)
-
-From 85731b224b68bac5a47774d5447bbd3e1d14236d Mon Sep 17 00:00:00 2001
-From: Memphiz <memphis@machzwo.de>
-Date: Tue, 18 Nov 2014 13:28:36 +0100
-Subject: [PATCH 55/93] - evaluate the setting for hiding watched
- movies/episodes/musicvideos in recently added job (should influence
- homescreen of skins only)
-
----
- xbmc/utils/RecentlyAddedJob.cpp | 10 +++++-----
- xbmc/video/VideoDatabase.cpp    | 27 ++++++++++++++++++++++++---
- xbmc/video/VideoDatabase.h      |  6 +++---
- 3 files changed, 32 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/utils/RecentlyAddedJob.cpp b/xbmc/utils/RecentlyAddedJob.cpp
-index de98a51..19eea07 100644
---- a/xbmc/utils/RecentlyAddedJob.cpp
-+++ b/xbmc/utils/RecentlyAddedJob.cpp
-@@ -30,6 +30,7 @@
- #include "music/tags/MusicInfoTag.h"
- #include "utils/StringUtils.h"
- #include "settings/AdvancedSettings.h"
-+#include "settings/Settings.h"
- #include "music/MusicThumbLoader.h"
- #include "video/VideoThumbLoader.h"
- 
-@@ -56,8 +57,8 @@ bool CRecentlyAddedJob::UpdateVideo()
-   loader.OnLoaderStart();
-   
-   videodatabase.Open();
--
--  if (videodatabase.GetRecentlyAddedMoviesNav("videodb://recentlyaddedmovies/", items, NUM_ITEMS))
-+  bool hideWatched = CSettings::GetInstance().GetBool("videolibrary.hiderecentlywatchedvideos");
-+  if (videodatabase.GetRecentlyAddedMoviesNav("videodb://recentlyaddedmovies/", items, NUM_ITEMS, hideWatched))
-   {  
-     for (; i < items.Size(); ++i)
-     {
-@@ -96,8 +97,7 @@ bool CRecentlyAddedJob::UpdateVideo()
-  
-   i = 0;
-   CFileItemList  TVShowItems; 
-- 
--  if (videodatabase.GetRecentlyAddedEpisodesNav("videodb://recentlyaddedepisodes/", TVShowItems, NUM_ITEMS))
-+  if (videodatabase.GetRecentlyAddedEpisodesNav("videodb://recentlyaddedepisodes/", TVShowItems, NUM_ITEMS, hideWatched))
-   {
-     for (; i < TVShowItems.Size(); ++i)
-     {    
-@@ -150,7 +150,7 @@ bool CRecentlyAddedJob::UpdateVideo()
-   i = 0;
-   CFileItemList MusicVideoItems;
- 
--  if (videodatabase.GetRecentlyAddedMusicVideosNav("videodb://recentlyaddedmusicvideos/", MusicVideoItems, NUM_ITEMS))
-+  if (videodatabase.GetRecentlyAddedMusicVideosNav("videodb://recentlyaddedmusicvideos/", MusicVideoItems, NUM_ITEMS, hideWatched))
-   {
-     for (; i < MusicVideoItems.Size(); ++i)
-     {
-diff --git a/xbmc/video/VideoDatabase.cpp b/xbmc/video/VideoDatabase.cpp
-index b56e2e8..6db3c7e 100644
---- a/xbmc/video/VideoDatabase.cpp
-+++ b/xbmc/video/VideoDatabase.cpp
-@@ -6466,27 +6466,48 @@ bool CVideoDatabase::GetMusicVideosNav(const std::string& strBaseDir, CFileItemL
-   return GetMusicVideosByWhere(videoUrl.ToString(), filter, items, true, sortDescription);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idMovie desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetMoviesByWhere(strBaseDir, filter, items);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idEpisode desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetEpisodesByWhere(strBaseDir, filter, items, false);
- }
- 
--bool CVideoDatabase::GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit)
-+bool CVideoDatabase::GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit, bool hideWatched)
- {
-   Filter filter;
-   filter.order = "dateAdded desc, idMVideo desc";
-   filter.limit = PrepareSQL("%u", limit ? limit : g_advancedSettings.m_iVideoLibraryRecentlyAddedItems);
-+
-+  if (hideWatched)
-+  {
-+    filter.AppendWhere("playCount <= 0");// only query unwatched items
-+    filter.AppendWhere("playCount IS NULL", false);
-+  }
-+
-   return GetMusicVideosByWhere(strBaseDir, filter, items);
- }
- 
-diff --git a/xbmc/video/VideoDatabase.h b/xbmc/video/VideoDatabase.h
-index 2021dd9..5f67d10 100644
---- a/xbmc/video/VideoDatabase.h
-+++ b/xbmc/video/VideoDatabase.h
-@@ -693,9 +693,9 @@ public:
-   bool GetEpisodesNav(const std::string& strBaseDir, CFileItemList& items, int idGenre=-1, int idYear=-1, int idActor=-1, int idDirector=-1, int idShow=-1, int idSeason=-1, const SortDescription &sortDescription = SortDescription());
-   bool GetMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, int idGenre=-1, int idYear=-1, int idArtist=-1, int idDirector=-1, int idStudio=-1, int idAlbum=-1, int idTag=-1, const SortDescription &sortDescription = SortDescription());
-   
--  bool GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
--  bool GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
--  bool GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0);
-+  bool GetRecentlyAddedMoviesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
-+  bool GetRecentlyAddedEpisodesNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
-+  bool GetRecentlyAddedMusicVideosNav(const std::string& strBaseDir, CFileItemList& items, unsigned int limit=0, bool hideWatched=false);
- 
-   bool HasContent();
-   bool HasContent(VIDEODB_CONTENT_TYPE type);
-
-From 11a94854f5d433c121314cdf989fd0e7bcc0102b Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 14:28:37 +0100
-Subject: [PATCH 56/93] build: Add vcsm lib
-
----
- configure.ac | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/configure.ac b/configure.ac
-index 7a06a31..239a2a1 100644
---- a/configure.ac
-+++ b/configure.ac
-@@ -1012,7 +1012,7 @@ if test "$use_gles" = "yes"; then
-       AC_DEFINE([HAVE_LIBEGL],[1],["Define to 1 if you have the `EGL' library (-lEGL)."])
-       AC_DEFINE([HAVE_LIBGLESV2],[1],["Define to 1 if you have the `GLESv2' library (-lGLESv2)."])
-       AC_MSG_RESULT(== WARNING: OpenGLES support is assumed.)
--      LIBS="$LIBS -lEGL -lGLESv2 -lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util"
-+      LIBS="$LIBS -lEGL -lGLESv2 -lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm"
-     else
-       AC_CHECK_LIB([EGL],   [main],, AC_MSG_ERROR($missing_library))
-       AC_CHECK_LIB([GLESv2],[main],, AC_MSG_ERROR($missing_library))
-
-From 0077cb637667dc35084234c792dedfc8c5e80485 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 8 May 2015 14:09:31 +0100
-Subject: [PATCH 57/93] [cec] re-implement 'RFC' style POLLing for LA
- registering process
-
----
- ...t-RFC-style-POLLing-for-LA-registering-pr.patch | 194 +++++++++++++++++++++
- tools/depends/target/libcec/Makefile               |   1 +
- 2 files changed, 195 insertions(+)
- create mode 100644 tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-
-diff --git a/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch b/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-new file mode 100644
-index 0000000..24bf69f
---- /dev/null
-+++ b/tools/depends/target/libcec/0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-@@ -0,0 +1,194 @@
-+From 8b8b9cd9d3821514d02d53214cf65af5b54543ed Mon Sep 17 00:00:00 2001
-+From: Matus Kral <matuskral@me.com>
-+Date: Fri, 8 May 2015 14:48:48 +0200
-+Subject: [PATCH] re-implement 'RFC' style POLLing for LA registering process
-+ (org A = dest A)
-+
-+- initially, sending empty asymetric msg (from X to Y) seemed to
-+  deliver same result. It is not - there are devices responding with NACK
-+  to msg send attempt in case they are busy (already receiving msg).
-+  ACK is returned only on 'RFC' POLL msg.
-+---
-+ .../adapter/RPi/RPiCECAdapterCommunication.cpp     | 49 +++++++++++++++++++++-
-+ .../adapter/RPi/RPiCECAdapterCommunication.h       |  3 ++
-+ .../adapter/RPi/RPiCECAdapterMessageQueue.cpp      | 40 ++++++++++++++----
-+ 3 files changed, 81 insertions(+), 11 deletions(-)
-+
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+index 1e93838..6f0804d 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+@@ -70,7 +70,8 @@ CRPiCECAdapterCommunication::CRPiCECAdapterCommunication(IAdapterCommunicationCa
-+     m_logicalAddress(CECDEVICE_UNKNOWN),
-+     m_bLogicalAddressChanged(false),
-+     m_previousLogicalAddress(CECDEVICE_FREEUSE),
-+-    m_bLogicalAddressRegistered(false)
-++    m_bLogicalAddressRegistered(false),
-++    m_bDisableCallbacks(false)
-+ {
-+   m_queue = new CRPiCECAdapterMessageQueue(this);
-+ }
-+@@ -140,6 +141,12 @@ void CRPiCECAdapterCommunication::OnTVServiceCallback(uint32_t reason, uint32_t
-+ 
-+ void CRPiCECAdapterCommunication::OnDataReceived(uint32_t header, uint32_t p0, uint32_t p1, uint32_t p2, uint32_t p3)
-+ {
-++  {
-++    CLockObject lock(m_mutex);
-++    if (m_bDisableCallbacks)
-++      return;
-++  }
-++
-+   VC_CEC_NOTIFY_T reason = (VC_CEC_NOTIFY_T)CEC_CB_REASON(header);
-+ 
-+ #ifdef CEC_DEBUGGING
-+@@ -363,12 +370,50 @@ std::string CRPiCECAdapterCommunication::GetError(void) const
-+   return strError;
-+ }
-+ 
-++void CRPiCECAdapterCommunication::SetDisableCallback(const bool disable)
-++{
-++  CLockObject lock(m_mutex);
-++  m_bDisableCallbacks = disable;
-++}
-++
-+ cec_adapter_message_state CRPiCECAdapterCommunication::Write(const cec_command &data, bool &bRetry, uint8_t iLineTimeout, bool bIsReply)
-+ {
-+   VC_CEC_ERROR_T vcAnswer;
-+   uint32_t iTimeout = (data.transmit_timeout ? data.transmit_timeout : iLineTimeout*1000);
-++  cec_adapter_message_state rc;
-++
-++  // to send a real POLL (dest & source LA the same - eg 11), VC
-++  // needs us to be in passivemode(we are) and with no actual LA
-++  // registered
-++  // libCEC sends 'true' POLLs only when at LA choosing process.
-++  // any other POLLing of devices happens with regular 'empty'
-++  // msg (just header, no OPCODE) with actual LA as source to X.
-++  // for us this means, that libCEC already registered tmp LA
-++  // (0xf, 0xe respectively) before it calls us for LA POLLing.
-++  //
-++  // that means - unregistering any A from adapter, _while_
-++  // ignoring callbacks (and especialy not reporting the
-++  // subsequent actions generated from VC layer - like
-++  // LA change to 0xf ...)
-++  //
-++  // calling vc_cec_release_logical_address() over and over is
-++  // fine.
-++  // once libCEC gets NACK on tested A, it calls RegisterLogicalAddress()
-++  // on it's own - so we don't need to take care of re-registering
-++  if (!data.opcode_set && data.initiator == data.destination)
-++  {
-++    SetDisableCallback(true);
-++
-++    vc_cec_release_logical_address();
-++    // accept nothing else than NACK or ACK, repeat until this happens
-++    while (ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT ==
-++          (rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer)));
-++
-++    SetDisableCallback(false);
-++    return rc;
-++  }
-+ 
-+-  cec_adapter_message_state rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer);
-++  rc = m_queue->Write(data, bRetry, iTimeout, bIsReply, vcAnswer);
-+ #ifdef CEC_DEBUGGING
-+   LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending data: result %s", ToString(vcAnswer));
-+ #endif
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-+index ba6d000..6024a27 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.h
-+@@ -100,6 +100,7 @@ namespace CEC
-+     bool UnregisterLogicalAddress(void);
-+     bool RegisterLogicalAddress(const cec_logical_address address, uint32_t iTimeoutMs = CEC_DEFAULT_CONNECT_TIMEOUT);
-+     int InitHostCEC(void);
-++    void SetDisableCallback(const bool disable);
-+ 
-+     bool m_bInitialised;   /**< true when the connection is initialised, false otherwise */
-+     std::string m_strError; /**< current error message */
-+@@ -113,6 +114,8 @@ namespace CEC
-+     VCHI_CONNECTION_T *         m_vchi_connection;
-+     cec_logical_address         m_previousLogicalAddress;
-+     bool                        m_bLogicalAddressRegistered;
-++
-++    bool                        m_bDisableCallbacks;
-+   };
-+ };
-+ 
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp b/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-+index 361ba38..169201d 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterMessageQueue.cpp
-+@@ -53,10 +53,13 @@ using namespace PLATFORM;
-+ 
-+ #define LIB_CEC m_com->m_callback->GetLib()
-+ 
-++// initialise new msg with unsuccesfull status, also
-++// set default return state to "UNKNOWN" - instead
-++// of NACK (which has special meaning for dev POLLing)
-+ CRPiCECAdapterMessageQueueEntry::CRPiCECAdapterMessageQueueEntry(CRPiCECAdapterMessageQueue *queue, const cec_command &command) :
-+     m_queue(queue),
-+     m_command(command),
-+-    m_retval(VC_CEC_ERROR_NO_ACK),
-++    m_retval(VC_CEC_ERROR_BUSY),
-+     m_bSucceeded(false)
-+ {
-+ 
-+@@ -130,6 +133,27 @@ uint32_t CRPiCECAdapterMessageQueueEntry::Result() const
-+ 
-+ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &command, bool &bRetry, uint32_t iLineTimeout, bool bIsReply, VC_CEC_ERROR_T &vcReply)
-+ {
-++  // handle POLL (msg like '11') in a special way - the way it was
-++  // originally designed by BCM, expected to happen and documented
-++  // in API docs (/opt/vc/includes)
-++  // due to often (more than 20% test cases - CEC bus with 8 devices)
-++  // irregularities on returned status, repeat until we get SAME
-++  // result twice in a row
-++  if (!command.opcode_set && command.destination == command.initiator)
-++  {
-++    int iReturnPrev = -1;
-++    int iReturn = 0;
-++
-++    while((iReturn = vc_cec_poll_address((CEC_AllDevices_T)command.destination)) != iReturnPrev)
-++      iReturnPrev = iReturn;
-++    if (iReturn == 0)
-++      return ADAPTER_MESSAGE_STATE_SENT_ACKED;
-++    else if (iReturn > 0)
-++      return ADAPTER_MESSAGE_STATE_SENT_NOT_ACKED;
-++    else
-++      return ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT;
-++  }
-++
-+   CRPiCECAdapterMessageQueueEntry *entry = new CRPiCECAdapterMessageQueueEntry(this, command);
-+   uint64_t iEntryId(0);
-+   /* add to the wait for ack queue */
-+@@ -192,8 +216,9 @@ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &c
-+   bRetry = false;
-+   if (iReturn != VCHIQ_SUCCESS)
-+   {
-+-    LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending command '%s' failed (%d)", command.opcode_set ? CCECTypeUtils::ToString(command.opcode) : "POLL", iReturn);
-+-    delete (entry);
-++    LIB_CEC->AddLog(CEC_LOG_DEBUG, "sending command '%s' failed (%d)", CCECTypeUtils::ToString(command.opcode), iReturn);
-++    delete entry;
-++    m_messages.erase(iEntryId);
-+     return ADAPTER_MESSAGE_STATE_ERROR;
-+   }
-+ 
-+@@ -213,12 +238,9 @@ cec_adapter_message_state CRPiCECAdapterMessageQueue::Write(const cec_command &c
-+     }
-+     else
-+     {
-+-      if (command.opcode_set)
-+-      {
-+-        bRetry = true;
-+-        LIB_CEC->AddLog(CEC_LOG_DEBUG, "command '%s' timeout", command.opcode_set ? CCECTypeUtils::ToString(command.opcode) : "POLL");
-+-        sleep(CEC_DEFAULT_TRANSMIT_RETRY_WAIT);
-+-      }
-++      bRetry = true;
-++      LIB_CEC->AddLog(CEC_LOG_DEBUG, "command '%s' timeout", CCECTypeUtils::ToString(command.opcode));
-++      sleep(CEC_DEFAULT_TRANSMIT_RETRY_WAIT);
-+       bReturn = ADAPTER_MESSAGE_STATE_WAITING_TO_BE_SENT;
-+     }
-+ 
-+-- 
-+1.9.1
-+
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index ddf9963..5d1f933 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -22,6 +22,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	rm -rf $(PLATFORM); mkdir -p $(PLATFORM)/build
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-
-From 3916ef0e55ad307d7a3e0f88ba5df0cdc73d5477 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 58/93] ffmpeg: test: increase number of threads
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-index 3498503..c2f3287 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-@@ -265,6 +265,9 @@ bool CDVDVideoCodecFFmpeg::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options
-     else
-     {
-       int num_threads = std::min(8 /*MAX_THREADS*/, g_cpuInfo.getCPUCount());
-+#ifdef TARGET_RASPBERRY_PI
-+      num_threads = num_threads > 1 ? 2 * num_threads : num_threads;
-+#endif
-       if( num_threads > 1)
-         m_pCodecContext->thread_count = num_threads;
-       m_pCodecContext->thread_safe_callbacks = 1;
-
-From 36fd4c27fe9af15d65461e32b8d105e00fd8df52 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 59/93] ffmpeg: Allow neon to be enabled in unified builds
-
----
- tools/depends/target/ffmpeg/Makefile | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index 6a9f105..fef5ef2 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -23,7 +23,11 @@ ffmpg_config += --enable-gnutls
- ffmpg_config += --enable-libdcadec
- 
- ifeq ($(CROSS_COMPILING), yes)
-+ ifeq ($(CPU), cortex-a7)
-+  ffmpg_config += --arch=arm --enable-cross-compile
-+ else
-   ffmpg_config += --arch=$(CPU) --enable-cross-compile
-+ endif
- endif
- ifeq ($(OS), linux)
-   ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
-
-From 5d5698967a69035d742d55f8986bce84831e73e9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 5 Mar 2015 20:00:59 +0000
-Subject: [PATCH 60/93] [ffmpmeg] Discard data before VO/VOL in mpeg-4 over
- mpegts
-
----
- ...-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch | 69 ++++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |  4 +-
- 2 files changed, 72 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch b/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-new file mode 100644
-index 0000000..eef7385
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-@@ -0,0 +1,69 @@
-+From ff289b3678b3b102f76c0fc0ffc802e3c8026fdb Mon Sep 17 00:00:00 2001
-+From: Deborah Crook <deborah@kynesim.co.uk>
-+Date: Thu, 5 Mar 2015 19:48:43 +0000
-+Subject: [PATCH] Discard data before VO/VOL in mpeg-4 over mpegts
-+
-+---
-+ libavcodec/mpeg4video_parser.c | 26 ++++++++++++++++++++++----
-+ 1 file changed, 22 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/mpeg4video_parser.c b/libavcodec/mpeg4video_parser.c
-+index aa5e87a..0d8b15a 100644
-+--- a/libavcodec/mpeg4video_parser.c
-++++ b/libavcodec/mpeg4video_parser.c
-+@@ -43,18 +43,32 @@ int ff_mpeg4_find_frame_end(ParseContext *pc, const uint8_t *buf, int buf_size)
-+     state     = pc->state;
-+
-+     i = 0;
-+-    if (!vop_found) {
-++    if (vop_found < 0) {
-++        for (i = 0; i < buf_size; i++) {
-++            state = (state << 8) | buf[i];
-++            if (state >= 0x100 && state <= 0x12f) {
-++                i++;
-++                vop_found = 0;
-++                break;
-++            }
-++        }
-++    }
-++
-++    if (vop_found == 0)
-++        vop_found = 1;
-++
-++    if (vop_found == 1) {
-+         for (i = 0; i < buf_size; i++) {
-+             state = (state << 8) | buf[i];
-+             if (state == 0x1B6) {
-+                 i++;
-+-                vop_found = 1;
-++                vop_found = 2;
-+                 break;
-+             }
-+         }
-+     }
-+
-+-    if (vop_found) {
-++    if (vop_found == 2) {
-+         /* EOF considered as end of frame */
-+         if (buf_size == 0)
-+             return 0;
-+@@ -133,12 +147,16 @@ static int mpeg4video_parse(AVCodecParserContext *s,
-+     ParseContext *pc = s->priv_data;
-+     int next;
-+
-++    if (pc->frame_start_found == 0 && !avctx->extradata)
-++        pc->frame_start_found = -1;
-++
-+     if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
-+         next = buf_size;
-+     } else {
-+         next = ff_mpeg4_find_frame_end(pc, buf, buf_size);
-+
-+-        if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-++        if (pc->frame_start_found < 0 ||
-++            ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
-+             *poutbuf      = NULL;
-+             *poutbuf_size = 0;
-+             return buf_size;
-+--
-+2.1.4
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index fef5ef2..e780521 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -1,7 +1,8 @@
- include ../../Makefile.include
- include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
--  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+  0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-+  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -77,6 +78,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); sed -i".bak" -e "s%pkg_config_default=pkg-config%export PKG_CONFIG_LIBDIR=$(PREFIX)/lib/pkgconfig \&\& pkg_config_default=$(NATIVEPREFIX)/bin/pkg-config%" configure
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-
-From 4a4b1b0427cfb3116a112d682d10c802a71f913a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 61/93] ffmpeg: Add some upstream HEVC optimisations
-
----
- tools/depends/target/ffmpeg/Makefile               |    6 +-
- .../added_ARM_NEON_optimized_SAO_patches.patch     | 3328 ++++++++++++++++++++
- ...hevcdsp_ARM_NEON_optimized_epel_functions.patch |  409 +++
- 3 files changed, 3742 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
- create mode 100644 tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index e780521..58ec0eb 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -2,7 +2,8 @@ include ../../Makefile.include
- include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
--  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+  0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
-+  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -79,6 +80,9 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
-+	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
- 	./configure $(ffmpg_config)
-diff --git a/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch b/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
-new file mode 100644
-index 0000000..792b5fe
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/added_ARM_NEON_optimized_SAO_patches.patch
-@@ -0,0 +1,3328 @@
-+From b0cb307c253d2c9f4b94a54dfc74ddb83af984cc Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 8 Dec 2014 13:24:40 +0200
-+Subject: [PATCH 1/9] added ARM NEON optimized SAO band offset
-+
-+---
-+ libavcodec/arm/Makefile            |   3 +-
-+ libavcodec/arm/hevcdsp_init_neon.c |  47 +++++++++
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 204 +++++++++++++++++++++++++++++++++++++
-+ 3 files changed, 253 insertions(+), 1 deletion(-)
-+ create mode 100644 libavcodec/arm/hevcdsp_sao_neon.S
-+
-+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-+index 6051ec8..093a2e8 100644
-+--- a/libavcodec/arm/Makefile
-++++ b/libavcodec/arm/Makefile
-+@@ -133,7 +133,8 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-+                                           arm/hevcdsp_deblock_neon.o    \
-+                                           arm/hevcdsp_epel_neon.o       \
-+                                           arm/hevcdsp_idct_neon.o       \
-+-                                          arm/hevcdsp_qpel_neon.o
-++                                          arm/hevcdsp_qpel_neon.o       \
-++                                          arm/hevcdsp_sao_neon.o
-+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
-+ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
-+                                           arm/rv40dsp_neon.o
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 733ff08..69e2b2c 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -22,6 +22,7 @@
-+ #include "libavutil/arm/cpu.h"
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-++#include "../bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+@@ -43,6 +44,11 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+                                 ptrdiff_t srcstride, int height, \
-+@@ -151,6 +157,44 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
-+     put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
-+ }
-+ 
-++static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-++                                          int16_t *sao_offset_val, int sao_left_class, int width, int height)
-++{
-++    pixel *dst = (pixel *)_dst;
-++    pixel *src = (pixel *)_src;
-++    int8_t offset_table[32] = { 0 };
-++    int k, y, x;
-++    int shift  = 3; // BIT_DEPTH - 5
-++
-++    stride_src /= sizeof(pixel);
-++    stride_dst /= sizeof(pixel);
-++
-++    for (k = 0; k < 4; k++)
-++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-++
-++    switch(width){
-++    case 8:
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 16:
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 32:
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    case 64:
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        break;
-++    default:
-++        for (y = 0; y < height; y++) {
-++            for (x = 0; x < width; x++)
-++                dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-++            dst += stride_dst;
-++            src += stride_src;
-++        }
-++    }
-++}
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -170,6 +214,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
-+         c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
-+         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-++        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-++          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-++        }
-+         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-+         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-+         put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+new file mode 100644
-+index 0000000..1f0ad64
-+--- /dev/null
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -0,0 +1,204 @@
-++/*
-++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ *
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#include "libavutil/arm/asm.S"
-++#include "neon.S"
-++
-++function ff_hevc_sao_band_w8_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8   {d24}, [r1], r3
-++        vshr.u8  d16, d24, #3
-++        vtbl.8   d16, {q0, q1}, d16
-++        vmovl.s8 q2, d16
-++        vmovl.u8 q6, d24
-++        vadd.s16 q2, q6
-++        vqmovun.s16 d4, q2
-++        vst1.8  {d4}, [r0], r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w16_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12}, [r1], r3
-++
-++        vshr.u8   q8, q12, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++
-++        vstm.8   r0, {q2}
-++        add    r0, r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12-q13}, [r1], r3
-++
-++        vshr.u8   q8, q12, #3
-++        vshr.u8   q9, q13, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++        vtbl.8  d18, {q0, q1}, d18
-++        vtbl.8  d19, {q0, q1}, d19
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17 // q8 free
-++        vmovl.s8 q4, d18
-++        vmovl.s8 q5, d19 // q9 free
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25 // q12 free
-++        vmovl.u8 q8, d26
-++        vmovl.u8 q9, d27 // q13 free
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++        vqmovun.s16 d6, q4 // q4 free
-++        vqmovun.s16 d7, q5 // q5 free
-++
-++        vst1.8 {q2-q3}, [r0], r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // offset_table
-++        vpush {d8-d15}
-++        vld1.8  {q0, q1}, [r5] // offset table
-++
-++1:      subs    r4, #1
-++        vld1.8  {q12-q13}, [r1]!
-++        vld1.8  {q14-q15}, [r1], r3
-++        sub     r1, #32
-++
-++        vshr.u8   q8, q12, #3
-++        vshr.u8   q9, q13, #3
-++        vshr.u8  q10, q14, #3
-++        vshr.u8  q11, q15, #3
-++
-++        vtbl.8  d16, {q0, q1}, d16
-++        vtbl.8  d17, {q0, q1}, d17
-++        vtbl.8  d18, {q0, q1}, d18
-++        vtbl.8  d19, {q0, q1}, d19
-++        vtbl.8  d20, {q0, q1}, d20
-++        vtbl.8  d21, {q0, q1}, d21
-++        vtbl.8  d22, {q0, q1}, d22
-++        vtbl.8  d23, {q0, q1}, d23
-++
-++        vmovl.s8 q2, d16
-++        vmovl.s8 q3, d17 // q8 free
-++        vmovl.s8 q4, d18
-++        vmovl.s8 q5, d19 // q9 free
-++
-++        vmovl.u8 q6, d24
-++        vmovl.u8 q7, d25 // q12 free
-++        vmovl.u8 q8, d26
-++        vmovl.u8 q9, d27 // q13 free
-++
-++        vadd.s16 q2, q6
-++        vadd.s16 q3, q7
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++
-++        vqmovun.s16 d4, q2
-++        vqmovun.s16 d5, q3
-++        vqmovun.s16 d6, q4 // q4 free
-++        vqmovun.s16 d7, q5 // q5 free
-++
-++        // free q4 -q9, q12 - q13
-++        vmovl.s8 q4, d20
-++        vmovl.s8 q5, d21 // q10 free
-++        vmovl.s8 q6, d22
-++        vmovl.s8 q7, d23 // q11 free
-++
-++        vmovl.u8  q8, d28
-++        vmovl.u8  q9, d29 // q14 free
-++        vmovl.u8 q10, d30
-++        vmovl.u8 q11, d31 // q15 free
-++
-++        vadd.s16 q4, q8
-++        vadd.s16 q5, q9
-++        vadd.s16 q6, q10
-++        vadd.s16 q7, q11
-++
-++        vqmovun.s16  d8, q4
-++        vqmovun.s16  d9, q5
-++        vqmovun.s16 d10, q6
-++        vqmovun.s16 d11, q7
-++
-++        vstm.8   r0, {q2-q5}
-++        add    r0, r2
-++        bne    1b
-++
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-+-- 
-+2.5.0
-+
-+
-+From 8429b1de64bb871d57651ecfe3b084e2dfe0af51 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Wed, 27 May 2015 18:10:20 +0100
-+Subject: [PATCH 2/9] added NEON optimized sao edge for eo1 width 64
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  47 ++++++++++++
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 147 +++++++++++++++++++++++++++++++++++++
-+ 2 files changed, 194 insertions(+)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 69e2b2c..c7b5404 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -22,6 +22,7 @@
-+ #include "libavutil/arm/cpu.h"
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-++#include "libavcodec/avcodec.h"
-+ #include "../bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+@@ -48,6 +49,7 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
-+ void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ 
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+@@ -195,6 +197,50 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     }
-+ }
-+ 
-++#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
-++static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-++                                          int16_t *_sao_offset_val, int eo, int width, int height)
-++{
-++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-++    static const int8_t pos[4][2][2] = {
-++        { { -1,  0 }, {  1, 0 } }, // horizontal
-++        { {  0, -1 }, {  0, 1 } }, // vertical
-++        { { -1, -1 }, {  1, 1 } }, // 45 degree
-++        { {  1, -1 }, { -1, 1 } }, // 135 degree
-++    };
-++    int8_t sao_offset_val[8];  // padding of 3 for vld
-++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
-++    pixel *dst = (pixel *)_dst;
-++    pixel *src = (pixel *)_src;
-++    int a_stride, b_stride;
-++    int x, y;
-++
-++    for (x = 0; x < 5; x++) {
-++        sao_offset_val[x] = _sao_offset_val[x];
-++    }
-++
-++    stride_src /= sizeof(pixel);
-++    stride_dst /= sizeof(pixel);
-++
-++    if (eo == 1 && width == 64) {
-++        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++    } else {
-++        a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-++        b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-++        for (y = 0; y < height; y++) {
-++            for (x = 0; x < width; x++) {
-++                int diff0         = CMP(src[x], src[x + a_stride]);
-++                int diff1         = CMP(src[x], src[x + b_stride]);
-++                int offset_val    = edge_idx[2 + diff0 + diff1];
-++                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-++            }
-++            src += stride_src;
-++            dst += stride_dst;
-++        }
-++    }
-++}
-++#undef CMP
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -216,6 +262,7 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
-+         for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
-+           c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
-++          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
-+         }
-+         put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
-+         put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 1f0ad64..5ec2de9 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -202,3 +202,150 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x02
-++        vpush {d8-d15}
-++1:      subs    r4, #1
-++        // load a
-++        sub     r1, r3
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #32
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1], r3
-++        sub     r1, #32
-++
-++        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q5, q1
-++        vcgt.u8 q1,  q1, q5
-++        vcgt.u8 q14, q6, q2
-++        vcgt.u8 q2,  q2, q6
-++        vcgt.u8 q15, q7, q3
-++        vcgt.u8 q3,  q3, q7
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vsub.s8 q13, q1, q13
-++        vsub.s8 q14, q2, q14
-++        vsub.s8 q15, q3, q15
-++
-++        vcgt.u8  q0,  q4, q8 // c > b
-++        vcgt.u8  q8,  q8, q4 // b > c
-++        vcgt.u8  q1,  q5, q9
-++        vcgt.u8  q9,  q9, q5
-++        vcgt.u8  q2,  q6, q10
-++        vcgt.u8 q10, q10, q6
-++        vcgt.u8  q3,  q7, q11
-++        vcgt.u8 q11, q11, q7
-++
-++        vsub.s8 q0, q8, q0 // diff1
-++        vsub.s8 q1, q9, q1
-++        vsub.s8 q2, q10, q2
-++        vsub.s8 q3, q11, q3
-++
-++        veor.u8 q8, q8  // zero register
-++        vdup.s8 q9, r6  // 2 to all elements
-++        add     r6, #1
-++        vdup.s8 q10, r6 // 3 to all elements
-++        sub     r6, #1
-++
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        vadd.s8 q2, q14
-++        vadd.s8 q3, q15
-++
-++        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
-++        vcgt.s8 q5, q1, q8
-++        vcgt.s8 q6, q2, q8
-++        vcgt.s8 q7, q3, q8
-++
-++        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
-++        vclt.s8 q12, q1, q8
-++        vclt.s8 q13, q2, q8
-++        vclt.s8 q14, q3, q8
-++
-++        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
-++        vand.8  q15,  q8, q4
-++        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
-++        vand.8   q8,  q8, q11
-++        vadd.s8  q0, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
-++        vand.8  q15,  q8, q5
-++        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
-++        vand.8   q8,  q8, q12
-++        vadd.s8  q1, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
-++        vand.8  q15,  q8, q6
-++        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
-++        vand.8   q8,  q8, q13
-++        vadd.s8  q2, q15, q8  // offset_idx
-++
-++        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
-++        vand.8  q15,  q8, q7
-++        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
-++        vand.8   q8,  q8, q14
-++        vadd.s8  q3, q15, q8  // offset_idx
-++        // TODO: load only once
-++        vld1.8   d16, [r5]
-++
-++        vtbl.8   d0, {d16}, d0
-++        vtbl.8   d1, {d16}, d1
-++        vtbl.8   d2, {d16}, d2
-++        vtbl.8   d3, {d16}, d3
-++        vtbl.8   d4, {d16}, d4
-++        vtbl.8   d5, {d16}, d5
-++        vtbl.8   d6, {d16}, d6
-++        vtbl.8   d7, {d16}, d7
-++
-++        // TODO: load only once
-++        // load c again
-++        sub     r1, r3
-++        sub     r1, r3
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++
-++        vmovl.u8   q8, d8
-++        vmovl.u8   q9, d9
-++        vmovl.u8  q10, d10
-++        vmovl.u8  q11, d11
-++        vmovl.u8  q12, d12
-++        vmovl.u8  q13, d13
-++        vmovl.u8  q14, d14
-++        vmovl.u8  q15, d15
-++
-++        vaddw.s8  q8, d0
-++        vaddw.s8  q9, d1
-++        vaddw.s8 q10, d2
-++        vaddw.s8 q11, d3
-++        vaddw.s8 q12, d4
-++        vaddw.s8 q13, d5
-++        vaddw.s8 q14, d6
-++        vaddw.s8 q15, d7
-++
-++        vqmovun.s16  d0, q8
-++        vqmovun.s16  d1, q9
-++        vqmovun.s16  d2, q10
-++        vqmovun.s16  d3, q11
-++        vqmovun.s16  d4, q12
-++        vqmovun.s16  d5, q13
-++        vqmovun.s16  d6, q14
-++        vqmovun.s16  d7, q15
-++
-++        vstm r0, {q0-q3}
-++        add  r0, r2
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-+-- 
-+2.5.0
-+
-+
-+From 402e2bd1c5ad659c757bf9734abe6331904fb9e2 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Tue, 16 Dec 2014 16:28:25 +0200
-+Subject: [PATCH 3/9] Added SAO edge offset for ARM NEON w32 and w64
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  46 +++-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 510 +++++++++++++++++++++++++++++++------
-+ 2 files changed, 474 insertions(+), 82 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index c7b5404..c32940e 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -49,7 +49,16 @@ void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_d
-+ void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+ void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++
-++void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++
-++void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-++void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ 
-+ #define PUT_PIXELS(name) \
-+     void name(int16_t *dst, uint8_t *src, \
-+@@ -222,9 +231,40 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+ 
-+-    if (eo == 1 && width == 64) {
-+-        ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-+-    } else {
-++    switch (width) {
-++    case 32:
-++        switch(eo) {
-++        case 0:
-++            ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 1:
-++            ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 2:
-++            ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 3:
-++            ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        }
-++        break;
-++    case 64:
-++        switch(eo) {
-++        case 0:
-++            ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 1:
-++            ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 2:
-++            ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        case 3:
-++            ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
-++            break;
-++        }
-++        break;
-++    default:
-+         a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-+         b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 5ec2de9..4687012 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -202,27 +202,7 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-+-function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x02
-+-        vpush {d8-d15}
-+-1:      subs    r4, #1
-+-        // load a
-+-        sub     r1, r3
-+-        vld1.8  {q0-q1}, [r1]!
-+-        vld1.8  {q2-q3}, [r1], r3
-+-        sub     r1, #32
-+-        // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-+-        sub     r1, #32
-+-        // load b
-+-        vld1.8  {q8-q9}, [r1]!
-+-        vld1.8  {q10-q11}, [r1], r3
-+-        sub     r1, #32
-+-
-++.macro edge_w64_body
-+         vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+         vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+         vcgt.u8 q13, q5, q1
-+@@ -251,69 +231,61 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+         vsub.s8 q2, q10, q2
-+         vsub.s8 q3, q11, q3
-+ 
-+-        veor.u8 q8, q8  // zero register
-+-        vdup.s8 q9, r6  // 2 to all elements
-+-        add     r6, #1
-+-        vdup.s8 q10, r6 // 3 to all elements
-+-        sub     r6, #1
-+-
-+         vadd.s8 q0, q12 //diff0 + diff1
-+         vadd.s8 q1, q13
-+         vadd.s8 q2, q14
-+         vadd.s8 q3, q15
-+ 
-+-        vcgt.s8 q4, q0, q8 // diff0 + diff1 > 0
-+-        vcgt.s8 q5, q1, q8
-+-        vcgt.s8 q6, q2, q8
-+-        vcgt.s8 q7, q3, q8
-+-
-+-        vclt.s8 q11, q0, q8 // diff0 + diff1 < 0
-+-        vclt.s8 q12, q1, q8
-+-        vclt.s8 q13, q2, q8
-+-        vclt.s8 q14, q3, q8
-+-
-+-        vadd.s8  q8,  q0, q9  // diff0 + diff1 + 2
-+-        vand.8  q15,  q8, q4
-+-        vadd.s8  q8,  q0, q10 // diff0 + diff1 + 3
-+-        vand.8   q8,  q8, q11
-+-        vadd.s8  q0, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q1, q9  // diff0 + diff1 + 2
-+-        vand.8  q15,  q8, q5
-+-        vadd.s8  q8,  q1, q10 // diff0 + diff1 + 3
-+-        vand.8   q8,  q8, q12
-+-        vadd.s8  q1, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q2, q9  // diff0 + diff1 + 2 + 2
-+-        vand.8  q15,  q8, q6
-+-        vadd.s8  q8,  q2, q10 // diff0 + diff1 + 2 + 3
-+-        vand.8   q8,  q8, q13
-+-        vadd.s8  q2, q15, q8  // offset_idx
-+-
-+-        vadd.s8  q8,  q3, q9  // diff0 + diff1 + 2 + 2
-+-        vand.8  q15,  q8, q7
-+-        vadd.s8  q8,  q3, q10 // diff0 + diff1 + 2 + 3
-+-        vand.8   q8,  q8, q14
-+-        vadd.s8  q3, q15, q8  // offset_idx
-+-        // TODO: load only once
-+-        vld1.8   d16, [r5]
-+-
-+-        vtbl.8   d0, {d16}, d0
-+-        vtbl.8   d1, {d16}, d1
-+-        vtbl.8   d2, {d16}, d2
-+-        vtbl.8   d3, {d16}, d3
-+-        vtbl.8   d4, {d16}, d4
-+-        vtbl.8   d5, {d16}, d5
-+-        vtbl.8   d6, {d16}, d6
-+-        vtbl.8   d7, {d16}, d7
-+-
-+-        // TODO: load only once
-+-        // load c again
-+-        sub     r1, r3
-+-        sub     r1, r3
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-+-        sub     r1, #32
-++        vdup.s8 q9, r6 // 3 to all elements
-++        sub     r6, #1
-++
-++        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-++        vclt.s8 q13, q1, #0
-++        vclt.s8 q14, q2, #0
-++        vclt.s8 q15, q3, #0
-++
-++        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
-++        vadd.s8  q10,  q1, q9
-++        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-++        vand.8   q13, q10, q13
-++        vadd.s8  q8,  q2, q9
-++        vadd.s8  q10,  q3, q9
-++        vand.8   q14, q8, q14
-++        vand.8   q15, q10, q15
-++
-++        vdup.s8 q9, r6  // 2 to all elements
-++        add     r6, #1
-++
-++        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-++        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q1, #0
-++        vadd.s8   q0, q11, q12  // offset_idx
-++
-++        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
-++        vcgt.s8  q12, q2, #0
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
-++        vadd.s8   q1, q11, q13
-++
-++        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q3, #0
-++        vadd.s8   q2, q11, q14
-++
-++        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
-++        vmov.32  d18[0], r7  // load offset table from general registers
-++        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vmov.32  d18[1], r5  // load rest of offset table
-++        vadd.s8   q3, q11, q15
-++
-++        vtbl.8   d0, {d18}, d0
-++        vtbl.8   d1, {d18}, d1
-++        vtbl.8   d2, {d18}, d2
-++        vtbl.8   d3, {d18}, d3
-++        vtbl.8   d4, {d18}, d4
-++        vtbl.8   d5, {d18}, d5
-++        vtbl.8   d6, {d18}, d6
-++        vtbl.8   d7, {d18}, d7
-+ 
-+         vmovl.u8   q8, d8
-+         vmovl.u8   q9, d9
-+@@ -344,8 +316,388 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+ 
-+         vstm r0, {q0-q3}
-+         add  r0, r2
-++.endm
-++
-++.macro edge_w32_body
-++        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q5, q1
-++        vcgt.u8 q1,  q1, q5
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vcgt.u8  q0,  q4, q8 // c > b
-++        vsub.s8 q13, q1, q13 // diff0 part 2
-++
-++        vcgt.u8  q6,  q8, q4 // b > c
-++        vcgt.u8  q1,  q5, q9
-++        vcgt.u8  q7,  q9, q5
-++
-++        vsub.s8 q0, q6, q0 // diff1
-++        vsub.s8 q1, q7, q1 // diff1 part 2
-++        vadd.s8 q0, q12 //diff0 + diff1
-++
-++        vdup.s8 q7, r6 // 3 to all elements
-++        sub     r6, #1
-++        vadd.s8 q1, q13
-++
-++        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-++        vclt.s8 q13, q1, #0
-++
-++        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
-++        vadd.s8  q10,  q1, q7
-++        vdup.s8 q7, r6  // 2 to all elements
-++        add     r6, #1
-++        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-++        vand.8   q13, q10, q13
-++
-++
-++        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-++        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
-++        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vcgt.s8  q10, q1, #0
-++        vadd.s8   q0, q11, q12  // offset_idx
-++
-++        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
-++        vmov.32  d14[0], r7  // load offset table from general registers
-++        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-++        vmov.32  d14[1], r5  // load rest of offset table
-++        vadd.s8   q1, q11, q13
-++
-++        vtbl.8   d0, {d14}, d0
-++        vtbl.8   d1, {d14}, d1
-++        vtbl.8   d2, {d14}, d2
-++        vtbl.8   d3, {d14}, d3
-++
-++        vmovl.u8   q6, d8
-++        vmovl.u8   q7, d9
-++        vmovl.u8  q10, d10
-++        vmovl.u8  q11, d11
-++
-++        vaddw.s8  q6, d0
-++        vaddw.s8  q7, d1
-++        vaddw.s8 q10, d2
-++        vaddw.s8 q11, d3
-++
-++        vqmovun.s16  d0, q6
-++        vqmovun.s16  d1, q7
-++        vqmovun.s16  d2, q10
-++        vqmovun.s16  d3, q11
-++
-++        vstm r0, {q0-q1}
-++        add  r0, r2
-++.endm
-++
-++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub    r1, #8
-++1:      subs    r4, #1
-++        vld1.64  {q10-q11}, [r1]!
-++        vld1.64  {q12-q13}, [r1]!
-++        vld1.64  {q14}, [r1], r3
-++        sub      r1, #64
-++        // load a
-++        vext.8 q0, q10, q11, #7
-++        vext.8 q1, q11, q12, #7
-++        vext.8 q2, q12, q13, #7
-++        vext.8 q3, q13, q14, #7
-++        // load c
-++        vext.8 q4, q10, q11, #8
-++        vext.8 q5, q11, q12, #8
-++        vext.8 q6, q12, q13, #8
-++        vext.8 q7, q13, q14, #8
-++        // load b
-++        vext.8 q8, q10, q11, #9
-++        vext.8 q9, q11, q12, #9
-++        vext.8 q10, q12, q13, #9
-++        vext.8 q11, q13, q14, #9
-++        edge_w64_body
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub     r1, r3
-++        // load a
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #32
-++1:      subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #32
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+         pop   {r4-r8}
-+         bx lr
-+ endfunc
-++
-++function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++1:      sub     r1, r3
-++        // load a
-++        // TODO: fix unaligned load
-++        //       don't reload a like in eo1
-++        sub     r1, #1
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #31
-++        subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        add     r1, #1
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #33
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++1:      sub     r1, r3
-++        // load a
-++        // TODO: fix unaligned load
-++        //       don't reload a like in eo1
-++        add     r1, #1
-++        vld1.8  {q0-q1}, [r1]!
-++        vld1.8  {q2-q3}, [r1], r3
-++        sub     r1, #33
-++        subs    r4, #1
-++        // load c
-++        vld1.8  {q4-q5}, [r1]!
-++        vld1.8  {q6-q7}, [r1], r3
-++        sub     r1, #32
-++        // load b
-++        sub     r1, #1
-++        vld1.8  {q8-q9}, [r1]!
-++        vld1.8  {q10-q11}, [r1]
-++        sub     r1, #31
-++        edge_w64_body
-++        // copy c to a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        vmov.64 q2, q6
-++        vmov.64 q3, q7
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        sub    r1, #8 // load 8 extra bytes
-++1:      subs    r4, #1
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
-++        sub    r1, #32
-++        // a
-++        vext.8  q0, q10, q11, #7
-++        vext.8  q1, q11, q12, #7
-++        // c
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        // b
-++        vext.8  q8, q10, q11, #9
-++        vext.8  q9, q11, q12, #9
-++        edge_w32_body
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        // load a
-++        sub     r1, r3
-++        vld1.8  {q0-q1}, [r1], r3
-++        // load c
-++        vld1.8  {q4-q5}, [r1], r3
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q8-q9}, [r1], r3
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vmov.64 q0, q4
-++        vmov.64 q1, q5
-++        // c
-++        vmov.64 q4, q8
-++        vmov.64 q5, q9
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        ldr    r5, [r5]
-++        vpush {d8-d15}
-++        // load a
-++        sub     r1, r3
-++        sub    r1, #8
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q0, q10, q11, #7
-++        vext.8  q1, q11, q12, #7
-++        // load c
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        vext.8  q2, q10, q11, #7
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q8, q10, q11, #9
-++        vext.8  q9, q11, q12, #9
-++        vext.8  q14, q10, q11, #8
-++        vext.8  q15, q11, q12, #8
-++        vext.8  q3, q10, q11, #7
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vmov.8 q0, q2
-++        vext.8 q1, q4, q5, #15
-++        // c
-++        vmov.8  q4, q14
-++        vmov.8  q5, q15
-++        vmov.8  q2, q3
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-++        push  {r4-r8}
-++        ldr    r4, [sp, #20] // height
-++        ldr    r5, [sp, #24] // sao_offset_val_table
-++        ldr    r6, =0x03
-++        ldr    r7, [r5]
-++        add    r5, #4
-++        sub    r1, r3
-++        ldr    r5, [r5]
-++        sub    r1, #8
-++        vpush {d8-d15}
-++        // load a
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q0, q10, q11, #9
-++        vext.8  q1, q11, q12, #9
-++        // load c
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q4, q10, q11, #8
-++        vext.8  q5, q11, q12, #8
-++        vext.8  q2, q12, q11, #8
-++1:      subs    r4, #1
-++        // load b
-++        vld1.8  {q10-q11}, [r1]
-++        add    r1, #32
-++        vld1.8  {q12}, [r1], r3
-++        sub    r1, #32
-++        vext.8  q8, q10, q11, #7
-++        vext.8  q9, q11, q12, #7
-++        vext.8  q3, q12, q10, #7
-++        edge_w32_body
-++        // inputs for next loop iteration
-++        // a
-++        vext.8 q0, q4, q5, #1
-++        vext.8 q1, q5, q2, #1
-++        // c
-++        vext.8  q4, q8, q9, #1
-++        vext.8  q5, q9, q3, #1
-++        vext.8  q2, q3, q1, #1
-++        bne   1b
-++        vpop  {d8-d15}
-++        pop   {r4-r8}
-++        bx lr
-++endfunc
-++
-+-- 
-+2.5.0
-+
-+
-+From 1898d052a73370166d57e17cc7c52b7275887df3 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Fri, 19 Dec 2014 09:44:10 +0200
-+Subject: [PATCH 4/9] Improved SAO band NEON opimizations made SAO buffer 16
-+ byte aligned added alignment hints to loads and stores optimized register
-+ usage in SAO band neon assembly
-+
-+---
-+ libavcodec/arm/hevcdsp_sao_neon.S | 212 +++++++++++++++-----------------------
-+ 1 file changed, 82 insertions(+), 130 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 4687012..ac21013 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -22,120 +22,84 @@
-+ #include "neon.S"
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-+-1:      subs    r4, #1
-+-        vld1.8   {d24}, [r1], r3
-++1:      subs     r12, #1
-++        vld1.8   {d24}, [r1,:64], r3
-+         vshr.u8  d16, d24, #3
-+         vtbl.8   d16, {q0, q1}, d16
-+-        vmovl.s8 q2, d16
-+         vmovl.u8 q6, d24
-+-        vadd.s16 q2, q6
-++        vaddw.s8 q6, d16
-+         vqmovun.s16 d4, q2
-+-        vst1.8  {d4}, [r0], r2
-++        vst1.8  {d4}, [r0,:64], r2
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-+-
-+-1:      subs    r4, #1
-+-        vld1.8  {q12}, [r1], r3
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-++1:      subs     r12, #1
-++        vld1.8  {q12}, [r1,:128], r3
-+         vshr.u8   q8, q12, #3
-+-
-+         vtbl.8  d16, {q0, q1}, d16
-+         vtbl.8  d17, {q0, q1}, d17
-+-
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17
-+-
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-
-+-        vstm.8   r0, {q2}
-+-        add    r0, r2
-++        vmovl.u8 q10, d24
-++        vmovl.u8 q11, d25
-++        vaddw.s8 q10, d16
-++        vaddw.s8 q11, d17
-++        vqmovun.s16 d4, q10
-++        vqmovun.s16 d5, q11
-++        vst1.8   {q2}, [r0,:128], r2
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-+-
-+-1:      subs    r4, #1
-+-        vld1.8  {q12-q13}, [r1], r3
-+-
-+-        vshr.u8   q8, q12, #3
-+-        vshr.u8   q9, q13, #3
-+-
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vtbl.8  d18, {q0, q1}, d18
-+-        vtbl.8  d19, {q0, q1}, d19
-+-
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17 // q8 free
-+-        vmovl.s8 q4, d18
-+-        vmovl.s8 q5, d19 // q9 free
-+-
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25 // q12 free
-+-        vmovl.u8 q8, d26
-+-        vmovl.u8 q9, d27 // q13 free
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q4 // q4 free
-+-        vqmovun.s16 d7, q5 // q5 free
-+-
-+-        vst1.8 {q2-q3}, [r0], r2
-+-        bne    1b
-+-
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-++
-++1:      subs     r12, #1
-++        vld1.8   {q2-q3}, [r1,:128], r3
-++        vshr.u8  q8, q2, #3
-++        vshr.u8  q9, q3, #3
-++        vtbl.8   d16, {q0, q1}, d16
-++        vtbl.8   d17, {q0, q1}, d17
-++        vtbl.8   d18, {q0, q1}, d18
-++        vtbl.8   d19, {q0, q1}, d19
-++        vmovl.u8 q12, d4
-++        vmovl.u8 q13, d5
-++        vmovl.u8 q14, d6
-++        vmovl.u8 q15, d7
-++        vaddw.s8 q12, d16
-++        vaddw.s8 q13, d17
-++        vaddw.s8 q14, d18
-++        vaddw.s8 q15, d19
-++        vqmovun.s16 d4, q12
-++        vqmovun.s16 d5, q13
-++        vqmovun.s16 d6, q14
-++        vqmovun.s16 d7, q15
-++        vst1.8   {q2-q3}, [r0,:128], r2
-++        bne      1b
-++
-++        bx       lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // offset_table
-+-        vpush {d8-d15}
-+-        vld1.8  {q0, q1}, [r5] // offset table
-++        ldr      r12, [sp, #4]    // offset_table address
-++        vld1.8   {q0, q1}, [r12]  // offset table
-++        ldr      r12, [sp, #0]    // height
-+ 
-+-1:      subs    r4, #1
-+-        vld1.8  {q12-q13}, [r1]!
-+-        vld1.8  {q14-q15}, [r1], r3
-++1:      subs     r12, #1
-++        vld1.8  {q12-q13}, [r1,:128]!
-++        vld1.8  {q14-q15}, [r1,:128], r3
-+         sub     r1, #32
-+ 
-+         vshr.u8   q8, q12, #3
-+@@ -152,53 +116,41 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         vtbl.8  d22, {q0, q1}, d22
-+         vtbl.8  d23, {q0, q1}, d23
-+ 
-+-        vmovl.s8 q2, d16
-+-        vmovl.s8 q3, d17 // q8 free
-+-        vmovl.s8 q4, d18
-+-        vmovl.s8 q5, d19 // q9 free
-++        vmovl.u8 q2, d24
-++        vmovl.u8 q3, d25
-++        vmovl.u8 q12, d26
-++        vmovl.u8 q13, d27
-+ 
-+-        vmovl.u8 q6, d24
-+-        vmovl.u8 q7, d25 // q12 free
-+-        vmovl.u8 q8, d26
-+-        vmovl.u8 q9, d27 // q13 free
-+-
-+-        vadd.s16 q2, q6
-+-        vadd.s16 q3, q7
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-++        vaddw.s8 q2, d16
-++        vaddw.s8 q3, d17
-++        vaddw.s8 q12, d18
-++        vaddw.s8 q13, d19
-+ 
-+         vqmovun.s16 d4, q2
-+         vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q4 // q4 free
-+-        vqmovun.s16 d7, q5 // q5 free
-+-
-+-        // free q4 -q9, q12 - q13
-+-        vmovl.s8 q4, d20
-+-        vmovl.s8 q5, d21 // q10 free
-+-        vmovl.s8 q6, d22
-+-        vmovl.s8 q7, d23 // q11 free
-+-
-+-        vmovl.u8  q8, d28
-+-        vmovl.u8  q9, d29 // q14 free
-+-        vmovl.u8 q10, d30
-+-        vmovl.u8 q11, d31 // q15 free
-+-
-+-        vadd.s16 q4, q8
-+-        vadd.s16 q5, q9
-+-        vadd.s16 q6, q10
-+-        vadd.s16 q7, q11
-+-
-+-        vqmovun.s16  d8, q4
-+-        vqmovun.s16  d9, q5
-+-        vqmovun.s16 d10, q6
-+-        vqmovun.s16 d11, q7
-+-
-+-        vstm.8   r0, {q2-q5}
-+-        add    r0, r2
-++        vqmovun.s16 d6, q12
-++        vqmovun.s16 d7, q13
-++
-++        vmovl.u8 q12, d28
-++        vmovl.u8 q13, d29
-++        vmovl.u8 q14, d30
-++        vmovl.u8 q15, d31
-++
-++        vaddw.s8 q12, d20
-++        vaddw.s8 q13, d21
-++        vaddw.s8 q14, d22
-++        vaddw.s8 q15, d23
-++
-++        vqmovun.s16  d8, q12
-++        vqmovun.s16  d9, q13
-++        vqmovun.s16 d10, q14
-++        vqmovun.s16 d11, q15
-++
-++        vst1.8     {q2-q3}, [r0,:128]!
-++        vst1.8     {q4-q5}, [r0,:128], r2
-++        sub    r0, #32
-+         bne    1b
-+ 
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+         bx lr
-+ endfunc
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 26bd536800db2f50ff6a021e1fda0d0394d1ea01 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 29 Dec 2014 15:00:49 +0200
-+Subject: [PATCH 5/9] better code reuse in NEON SAO band
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  16 ++--
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 155 +++++++++++++------------------------
-+ 2 files changed, 61 insertions(+), 110 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index c32940e..6379810 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-+-void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-+-void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t * offset_table);
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+ 
-+ void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+@@ -185,16 +185,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+ 
-+     switch(width){
-+     case 8:
-+-        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 16:
-+-        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 32:
-+-        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     case 64:
-+-        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, height, offset_table);
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+     default:
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index ac21013..8852550 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -21,53 +21,13 @@
-+ #include "libavutil/arm/asm.S"
-+ #include "neon.S"
-+ 
-+-function ff_hevc_sao_band_w8_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-++.macro init_sao_band
-++        ldr      r12, [sp, #0]    // offset_table address
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8   {d24}, [r1,:64], r3
-+-        vshr.u8  d16, d24, #3
-+-        vtbl.8   d16, {q0, q1}, d16
-+-        vmovl.u8 q6, d24
-+-        vaddw.s8 q6, d16
-+-        vqmovun.s16 d4, q2
-+-        vst1.8  {d4}, [r0,:64], r2
-+-        bne    1b
-+-
-+-        bx lr
-+-endfunc
-+-
-+-function ff_hevc_sao_band_w16_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8  {q12}, [r1,:128], r3
-+-        vshr.u8   q8, q12, #3
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vmovl.u8 q10, d24
-+-        vmovl.u8 q11, d25
-+-        vaddw.s8 q10, d16
-+-        vaddw.s8 q11, d17
-+-        vqmovun.s16 d4, q10
-+-        vqmovun.s16 d5, q11
-+-        vst1.8   {q2}, [r0,:128], r2
-+-        bne    1b
-+-
-+-        bx lr
-+-endfunc
-+-
-+-function ff_hevc_sao_band_w32_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-++        ldr      r12, [sp, #4]    // height
-++.endm
-+ 
-+-1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1,:128], r3
-++.macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-+         vtbl.8   d16, {q0, q1}, d16
-+@@ -86,6 +46,43 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+         vqmovun.s16 d5, q13
-+         vqmovun.s16 d6, q14
-+         vqmovun.s16 d7, q15
-++.endm
-++
-++function ff_hevc_sao_band_w8_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #4
-++        vld1.8   {d4}, [r1,:64], r3
-++        vld1.8   {d5}, [r1,:64], r3
-++        vld1.8   {d6}, [r1,:64], r3
-++        vld1.8   {d7}, [r1,:64], r3
-++        sao_band_32
-++        vst1.8  {d4}, [r0,:64], r2
-++        vst1.8  {d5}, [r0,:64], r2
-++        vst1.8  {d6}, [r0,:64], r2
-++        vst1.8  {d7}, [r0,:64], r2
-++        bne    1b
-++
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w16_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #2
-++        vld1.8  {q2}, [r1,:128], r3
-++        vld1.8  {q3}, [r1,:128], r3
-++        sao_band_32
-++        vst1.8   {q2}, [r0,:128], r2
-++        vst1.8   {q3}, [r0,:128], r2
-++        bne    1b
-++
-++        bx lr
-++endfunc
-++
-++function ff_hevc_sao_band_w32_neon_8, export=1
-++        init_sao_band
-++1:      subs     r12, #1
-++        vld1.8   {q2-q3}, [r1,:128], r3
-++        sao_band_32
-+         vst1.8   {q2-q3}, [r0,:128], r2
-+         bne      1b
-+ 
-+@@ -93,63 +90,17 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+ endfunc
-+ 
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+-        ldr      r12, [sp, #4]    // offset_table address
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #0]    // height
-+-
-+-1:      subs     r12, #1
-+-        vld1.8  {q12-q13}, [r1,:128]!
-+-        vld1.8  {q14-q15}, [r1,:128], r3
-+-        sub     r1, #32
-+-
-+-        vshr.u8   q8, q12, #3
-+-        vshr.u8   q9, q13, #3
-+-        vshr.u8  q10, q14, #3
-+-        vshr.u8  q11, q15, #3
-+-
-+-        vtbl.8  d16, {q0, q1}, d16
-+-        vtbl.8  d17, {q0, q1}, d17
-+-        vtbl.8  d18, {q0, q1}, d18
-+-        vtbl.8  d19, {q0, q1}, d19
-+-        vtbl.8  d20, {q0, q1}, d20
-+-        vtbl.8  d21, {q0, q1}, d21
-+-        vtbl.8  d22, {q0, q1}, d22
-+-        vtbl.8  d23, {q0, q1}, d23
-+-
-+-        vmovl.u8 q2, d24
-+-        vmovl.u8 q3, d25
-+-        vmovl.u8 q12, d26
-+-        vmovl.u8 q13, d27
-+-
-+-        vaddw.s8 q2, d16
-+-        vaddw.s8 q3, d17
-+-        vaddw.s8 q12, d18
-+-        vaddw.s8 q13, d19
-+-
-+-        vqmovun.s16 d4, q2
-+-        vqmovun.s16 d5, q3
-+-        vqmovun.s16 d6, q12
-+-        vqmovun.s16 d7, q13
-+-
-+-        vmovl.u8 q12, d28
-+-        vmovl.u8 q13, d29
-+-        vmovl.u8 q14, d30
-+-        vmovl.u8 q15, d31
-+-
-+-        vaddw.s8 q12, d20
-+-        vaddw.s8 q13, d21
-+-        vaddw.s8 q14, d22
-+-        vaddw.s8 q15, d23
-+-
-+-        vqmovun.s16  d8, q12
-+-        vqmovun.s16  d9, q13
-+-        vqmovun.s16 d10, q14
-+-        vqmovun.s16 d11, q15
-+-
-+-        vst1.8     {q2-q3}, [r0,:128]!
-+-        vst1.8     {q4-q5}, [r0,:128], r2
-+-        sub    r0, #32
-+-        bne    1b
-++        init_sao_band
-++1:      subs      r12, #1
-++        vld1.8    {q2-q3}, [r1,:128]!
-++        sao_band_32
-++        vst1.8    {q2-q3}, [r0,:128]!
-++        vld1.8    {q2-q3}, [r1,:128], r3
-++        sub       r1, #32
-++        sao_band_32
-++        vst1.8    {q2-q3}, [r0,:128], r2
-++        sub       r0, #32
-++        bne       1b
-+ 
-+         bx lr
-+ endfunc
-+-- 
-+2.5.0
-+
-+
-+From f93646a97bc885b81759e774d04be3781916a3e7 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Wed, 7 Jan 2015 15:27:38 +0200
-+Subject: [PATCH 6/9] More SAO NEON optimizations Now uses only 8 bit integers
-+ for SAO calculations
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |   7 +-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 664 +++++++++++++++----------------------
-+ 2 files changed, 272 insertions(+), 399 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 6379810..8d6e863 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -225,7 +225,7 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     int x, y;
-+ 
-+     for (x = 0; x < 5; x++) {
-+-        sao_offset_val[x] = _sao_offset_val[x];
-++        sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-+     }
-+ 
-+     stride_src /= sizeof(pixel);
-+@@ -271,8 +271,9 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+             for (x = 0; x < width; x++) {
-+                 int diff0         = CMP(src[x], src[x + a_stride]);
-+                 int diff1         = CMP(src[x], src[x + b_stride]);
-+-                int offset_val    = edge_idx[2 + diff0 + diff1];
-+-                dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-++                int idx           = diff0 + diff1;
-++                if (idx)
-++                    dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
-+             }
-+             src += stride_src;
-+             dst += stride_dst;
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 8852550..5fc482b 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -1,5 +1,5 @@
-+ /*
-+- * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+  *
-+  * This file is part of FFmpeg.
-+  *
-+@@ -23,6 +23,7 @@
-+ 
-+ .macro init_sao_band
-+         ldr      r12, [sp, #0]    // offset_table address
-++        pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-+ .endm
-+@@ -30,36 +31,31 @@
-+ .macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-++        vmov.u8  q14, #128
-+         vtbl.8   d16, {q0, q1}, d16
-+         vtbl.8   d17, {q0, q1}, d17
-+         vtbl.8   d18, {q0, q1}, d18
-+         vtbl.8   d19, {q0, q1}, d19
-+-        vmovl.u8 q12, d4
-+-        vmovl.u8 q13, d5
-+-        vmovl.u8 q14, d6
-+-        vmovl.u8 q15, d7
-+-        vaddw.s8 q12, d16
-+-        vaddw.s8 q13, d17
-+-        vaddw.s8 q14, d18
-+-        vaddw.s8 q15, d19
-+-        vqmovun.s16 d4, q12
-+-        vqmovun.s16 d5, q13
-+-        vqmovun.s16 d6, q14
-+-        vqmovun.s16 d7, q15
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q14
-++        vqadd.s8 q2, q8
-++        vqadd.s8 q3, q9
-++        vsub.s8  q2, q14
-++        vsub.s8  q3, q14
-+ .endm
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #4
-+-        vld1.8   {d4}, [r1,:64], r3
-+-        vld1.8   {d5}, [r1,:64], r3
-+-        vld1.8   {d6}, [r1,:64], r3
-+-        vld1.8   {d7}, [r1,:64], r3
-++        vld1.8   {d4}, [r1, :64], r3
-++        vld1.8   {d5}, [r1, :64], r3
-++        vld1.8   {d6}, [r1, :64], r3
-++        vld1.8   {d7}, [r1, :64], r3
-+         sao_band_32
-+-        vst1.8  {d4}, [r0,:64], r2
-+-        vst1.8  {d5}, [r0,:64], r2
-+-        vst1.8  {d6}, [r0,:64], r2
-+-        vst1.8  {d7}, [r0,:64], r2
-++        vst1.8  {d4}, [r0, :64], r2
-++        vst1.8  {d5}, [r0, :64], r2
-++        vst1.8  {d6}, [r0, :64], r2
-++        vst1.8  {d7}, [r0, :64], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -68,11 +64,11 @@ endfunc
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #2
-+-        vld1.8  {q2}, [r1,:128], r3
-+-        vld1.8  {q3}, [r1,:128], r3
-++        vld1.8  {q2}, [r1, :128], r3
-++        vld1.8  {q3}, [r1, :128], r3
-+         sao_band_32
-+-        vst1.8   {q2}, [r0,:128], r2
-+-        vst1.8   {q3}, [r0,:128], r2
-++        vst1.8   {q2}, [r0, :128], r2
-++        vst1.8   {q3}, [r0, :128], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -81,9 +77,9 @@ endfunc
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1,:128], r3
-++        vld1.8   {q2-q3}, [r1, :128], r3
-+         sao_band_32
-+-        vst1.8   {q2-q3}, [r0,:128], r2
-++        vst1.8   {q2-q3}, [r0, :128], r2
-+         bne      1b
-+ 
-+         bx       lr
-+@@ -92,263 +88,153 @@ endfunc
-+ function ff_hevc_sao_band_w64_neon_8, export=1
-+         init_sao_band
-+ 1:      subs      r12, #1
-+-        vld1.8    {q2-q3}, [r1,:128]!
-++        pld       [r1, r3]
-++        vld1.8    {q2-q3}, [r1, :128]!
-+         sao_band_32
-+-        vst1.8    {q2-q3}, [r0,:128]!
-+-        vld1.8    {q2-q3}, [r1,:128], r3
-++        vst1.8    {q2-q3}, [r0, :128]!
-++        vld1.8    {q2-q3}, [r1, :128], r3
-+         sub       r1, #32
-+         sao_band_32
-+-        vst1.8    {q2-q3}, [r0,:128], r2
-++        vst1.8    {q2-q3}, [r0, :128], r2
-+         sub       r0, #32
-+         bne       1b
-+ 
-+         bx lr
-+ endfunc
-+-
-++// input
-++// a in q0 - q3
-++// c in q4 - q7
-++// b in q8 - q11
-++// offset table in r7 and r5
-++// output in q0 - q3
-++// clobbers q12 - q15
-+ .macro edge_w64_body
-+-        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q5, q1
-+-        vcgt.u8 q1,  q1, q5
-+-        vcgt.u8 q14, q6, q2
-+-        vcgt.u8 q2,  q2, q6
-+-        vcgt.u8 q15, q7, q3
-+-        vcgt.u8 q3,  q3, q7
-+-
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vsub.s8 q13, q1, q13
-+-        vsub.s8 q14, q2, q14
-+-        vsub.s8 q15, q3, q15
-+-
-++        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13,  q5, q1
-++        vcgt.u8  q1,  q1, q5
-++        vsub.s8 q12,  q0, q12 // diff0
-+         vcgt.u8  q0,  q4, q8 // c > b
-+-        vcgt.u8  q8,  q8, q4 // b > c
-++        vsub.s8 q13,  q1, q13
-++
-++        vcgt.u8 q14,  q8, q4 // b > c
-+         vcgt.u8  q1,  q5, q9
-+-        vcgt.u8  q9,  q9, q5
-+-        vcgt.u8  q2,  q6, q10
-+-        vcgt.u8 q10, q10, q6
-+-        vcgt.u8  q3,  q7, q11
-+-        vcgt.u8 q11, q11, q7
-++        vcgt.u8 q15,  q9, q5
-++        vsub.s8  q0, q14, q0 // diff1
-+ 
-+-        vsub.s8 q0, q8, q0 // diff1
-+-        vsub.s8 q1, q9, q1
-+-        vsub.s8 q2, q10, q2
-+-        vsub.s8 q3, q11, q3
-++        vsub.s8  q1, q15, q1
-+ 
-+-        vadd.s8 q0, q12 //diff0 + diff1
-+-        vadd.s8 q1, q13
-+-        vadd.s8 q2, q14
-+-        vadd.s8 q3, q15
-+-
-+-        vdup.s8 q9, r6 // 3 to all elements
-+-        sub     r6, #1
-+-
-+-        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+-        vclt.s8 q13, q1, #0
-+-        vclt.s8 q14, q2, #0
-+-        vclt.s8 q15, q3, #0
-+-
-+-        vadd.s8  q8,  q0, q9 // diff0 + diff1 + 3
-+-        vadd.s8  q10,  q1, q9
-+-        vand.8   q12, q8, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+-        vand.8   q13, q10, q13
-+-        vadd.s8  q8,  q2, q9
-+-        vadd.s8  q10,  q3, q9
-+-        vand.8   q14, q8, q14
-+-        vand.8   q15, q10, q15
-+-
-+-        vdup.s8 q9, r6  // 2 to all elements
-+-        add     r6, #1
-+-
-+-        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+-        vadd.s8   q8, q0, q9 // diff0 + diff1 + 2
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q1, #0
-+-        vadd.s8   q0, q11, q12  // offset_idx
-+-
-+-        vadd.s8   q8, q1, q9 // diff0 + diff1 + 2
-+-        vcgt.s8  q12, q2, #0
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vadd.s8   q8, q2, q9 // diff0 + diff1 + 2
-+-        vadd.s8   q1, q11, q13
-+-
-+-        vand.8   q11, q8, q12 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q3, #0
-+-        vadd.s8   q2, q11, q14
-+-
-+-        vadd.s8   q8, q3, q9 // diff0 + diff1 + 2
-+-        vmov.32  d18[0], r7  // load offset table from general registers
-+-        vand.8   q11, q8, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vmov.32  d18[1], r5  // load rest of offset table
-+-        vadd.s8   q3, q11, q15
-+-
-+-        vtbl.8   d0, {d18}, d0
-+-        vtbl.8   d1, {d18}, d1
-+-        vtbl.8   d2, {d18}, d2
-+-        vtbl.8   d3, {d18}, d3
-+-        vtbl.8   d4, {d18}, d4
-+-        vtbl.8   d5, {d18}, d5
-+-        vtbl.8   d6, {d18}, d6
-+-        vtbl.8   d7, {d18}, d7
-+-
-+-        vmovl.u8   q8, d8
-+-        vmovl.u8   q9, d9
-+-        vmovl.u8  q10, d10
-+-        vmovl.u8  q11, d11
-+-        vmovl.u8  q12, d12
-+-        vmovl.u8  q13, d13
-+-        vmovl.u8  q14, d14
-+-        vmovl.u8  q15, d15
-+-
-+-        vaddw.s8  q8, d0
-+-        vaddw.s8  q9, d1
-+-        vaddw.s8 q10, d2
-+-        vaddw.s8 q11, d3
-+-        vaddw.s8 q12, d4
-+-        vaddw.s8 q13, d5
-+-        vaddw.s8 q14, d6
-+-        vaddw.s8 q15, d7
-+-
-+-        vqmovun.s16  d0, q8
-+-        vqmovun.s16  d1, q9
-+-        vqmovun.s16  d2, q10
-+-        vqmovun.s16  d3, q11
-+-        vqmovun.s16  d4, q12
-+-        vqmovun.s16  d5, q13
-+-        vqmovun.s16  d6, q14
-+-        vqmovun.s16  d7, q15
-+-
-+-        vstm r0, {q0-q3}
-+-        add  r0, r2
-+-.endm
-++        vadd.s8  q0, q12 //diff0 + diff1
-++        vadd.s8  q1, q13
-+ 
-+-.macro edge_w32_body
-+-        vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q5, q1
-+-        vcgt.u8 q1,  q1, q5
-++        vcgt.u8 q14,  q6, q2
-++        vcgt.u8  q2,  q2, q6
-++        vcgt.u8 q15,  q7, q3
-++        vcgt.u8  q3,  q3, q7
-+ 
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vcgt.u8  q0,  q4, q8 // c > b
-+-        vsub.s8 q13, q1, q13 // diff0 part 2
-++        vsub.s8 q14,  q2, q14
-++        vcgt.u8  q2,  q6, q10
-++        vsub.s8 q15,  q3, q15
-+ 
-+-        vcgt.u8  q6,  q8, q4 // b > c
-+-        vcgt.u8  q1,  q5, q9
-+-        vcgt.u8  q7,  q9, q5
-++        vcgt.u8 q12, q10, q6
-++        vcgt.u8  q3,  q7, q11
-++        vcgt.u8 q13, q11, q7
-++        vsub.s8  q2, q12, q2
-++        vsub.s8  q3, q13, q3
-+ 
-+-        vsub.s8 q0, q6, q0 // diff1
-+-        vsub.s8 q1, q7, q1 // diff1 part 2
-+-        vadd.s8 q0, q12 //diff0 + diff1
-++        vmov.s8 q13, #2 // 2 to all elements
-+ 
-+-        vdup.s8 q7, r6 // 3 to all elements
-+-        sub     r6, #1
-+-        vadd.s8 q1, q13
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q15
-++
-++        vmov.32  d24[0], r4  // load offset table from general registers
-++        vmov.32  d24[1], r5  // load rest of offset table
-+ 
-+-        vclt.s8 q12, q0, #0 // diff0 + diff1 < 0
-+-        vclt.s8 q13, q1, #0
-+-
-+-        vadd.s8  q6,  q0, q7 // diff0 + diff1 + 3
-+-        vadd.s8  q10,  q1, q7
-+-        vdup.s8 q7, r6  // 2 to all elements
-+-        add     r6, #1
-+-        vand.8   q12, q6, q12 // if (diff0 + diff1 < 0) then (diff0 + diff1 + 3) else 0
-+-        vand.8   q13, q10, q13
-+-
-+-
-+-        vcgt.s8  q10, q0, #0 // diff0 + diff1 > 0
-+-        vadd.s8   q6, q0, q7 // diff0 + diff1 + 2
-+-        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vcgt.s8  q10, q1, #0
-+-        vadd.s8   q0, q11, q12  // offset_idx
-+-
-+-        vadd.s8   q6, q1, q7 // diff0 + diff1 + 2
-+-        vmov.32  d14[0], r7  // load offset table from general registers
-+-        vand.8   q11, q6, q10 // if (diff0 + diff1 > 0) then (diff0 + diff1 + 2) else 0
-+-        vmov.32  d14[1], r5  // load rest of offset table
-+-        vadd.s8   q1, q11, q13
-+-
-+-        vtbl.8   d0, {d14}, d0
-+-        vtbl.8   d1, {d14}, d1
-+-        vtbl.8   d2, {d14}, d2
-+-        vtbl.8   d3, {d14}, d3
-+-
-+-        vmovl.u8   q6, d8
-+-        vmovl.u8   q7, d9
-+-        vmovl.u8  q10, d10
-+-        vmovl.u8  q11, d11
-+-
-+-        vaddw.s8  q6, d0
-+-        vaddw.s8  q7, d1
-+-        vaddw.s8 q10, d2
-+-        vaddw.s8 q11, d3
-+-
-+-        vqmovun.s16  d0, q6
-+-        vqmovun.s16  d1, q7
-+-        vqmovun.s16  d2, q10
-+-        vqmovun.s16  d3, q11
-+-
-+-        vstm r0, {q0-q1}
-+-        add  r0, r2
-++        vadd.s8 q0, q13
-++        vadd.s8 q1, q13
-++        vadd.s8 q2, q13
-++        vadd.s8 q3, q13
-++
-++        vmov.u8  q15, #128 // s8 #-128
-++        vtbl.8   d0, {d24}, d0
-++        vtbl.8   d1, {d24}, d1
-++        vtbl.8   d2, {d24}, d2
-++        vtbl.8   d3, {d24}, d3
-++        vtbl.8   d4, {d24}, d4
-++        vtbl.8   d5, {d24}, d5
-++        vtbl.8   d6, {d24}, d6
-++        vtbl.8   d7, {d24}, d7
-++
-++        vadd.s8  q12,  q4, q15
-++        vadd.s8  q13,  q5, q15
-++        vadd.s8  q14,  q6, q15
-++        vadd.s8  q15,  q7, q15
-++        vqadd.s8 q12,  q0
-++        vqadd.s8 q15,  q3
-++        vmov.u8   q3, #128 // s8 #-128
-++        vqadd.s8 q13,  q1
-++        vqadd.s8 q14,  q2
-++        vsub.s8   q0, q12, q3
-++        vsub.s8   q1, q13, q3
-++        vsub.s8   q2, q14, q3
-++        vsub.s8   q3, q15, q3
-++        vst1.8  {q0-q1}, [r0, :128]!
-++        vst1.8  {q2-q3}, [r0, :128], r2
-++        sub     r0, #32
-+ .endm
-+ 
-+-function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-++.macro init_edge_64
-++        push   {r4-r5}
-++        ldr    r12, [sp, #8] // height
-++        ldr    r5, [sp, #12] // sao_offset_val_table
-++        ldr    r4, [r5]
-+         add    r5, #4
-+         ldr    r5, [r5]
-++.endm
-++
-++function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
-++        init_edge_64
-+         vpush {d8-d15}
-+         sub    r1, #8
-+-1:      subs    r4, #1
-+-        vld1.64  {q10-q11}, [r1]!
-+-        vld1.64  {q12-q13}, [r1]!
-+-        vld1.64  {q14}, [r1], r3
-+-        sub      r1, #64
-++1:      subs    r12, #1
-++        vld1.64  {d7}, [r1, :64]!
-++        vld1.64  {q4-q5}, [r1, :128]! // load c
-++        vld1.64  {q6-q7}, [r1, :128]!
-++        vld1.64  {d24}, [r1, :64], r3
-++        sub      r1, #72
-+         // load a
-+-        vext.8 q0, q10, q11, #7
-+-        vext.8 q1, q11, q12, #7
-+-        vext.8 q2, q12, q13, #7
-+-        vext.8 q3, q13, q14, #7
-+-        // load c
-+-        vext.8 q4, q10, q11, #8
-+-        vext.8 q5, q11, q12, #8
-+-        vext.8 q6, q12, q13, #8
-+-        vext.8 q7, q13, q14, #8
-++        vext.8 q0, q3, q4, #15
-++        vext.8 q1, q4, q5, #15
-++        vext.8 q2, q5, q6, #15
-++        vext.8 q3, q6, q7, #15
-+         // load b
-+-        vext.8 q8, q10, q11, #9
-+-        vext.8 q9, q11, q12, #9
-+-        vext.8 q10, q12, q13, #9
-+-        vext.8 q11, q13, q14, #9
-++        vext.8 q8, q4, q5, #1
-++        vext.8 q9, q5, q6, #1
-++        vext.8 q10, q6, q7, #1
-++        vext.8 q11, q7, q12, #1
-+         edge_w64_body
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+         sub     r1, r3
-+         // load a
-+-        vld1.8  {q0-q1}, [r1]!
-+-        vld1.8  {q2-q3}, [r1], r3
-++        vld1.8  {q0-q1}, [r1, :128]!
-++        vld1.8  {q2-q3}, [r1, :128], r3
-+         sub     r1, #32
-+-1:      subs    r4, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q8-q9}, [r1]!
-+-        vld1.8  {q10-q11}, [r1]
-++        vld1.8  {q8-q9}, [r1, :128]!
-++        vld1.8  {q10-q11}, [r1, :128], r3
-+         sub     r1, #32
-+         edge_w64_body
-+         // copy c to a
-+@@ -356,20 +242,19 @@ function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
-+         vmov.64 q1, q5
-+         vmov.64 q2, q6
-+         vmov.64 q3, q7
-++        // copy b to c
-++        vmov.64 q4, q8
-++        vmov.64 q5, q9
-++        vmov.64 q6, q10
-++        vmov.64 q7, q11
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+ 1:      sub     r1, r3
-+         // load a
-+@@ -379,10 +264,10 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+         vld1.8  {q0-q1}, [r1]!
-+         vld1.8  {q2-q3}, [r1], r3
-+         sub     r1, #31
-+-        subs    r4, #1
-++        subs    r12, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-+         // load b
-+         add     r1, #1
-+@@ -390,25 +275,14 @@ function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
-+         vld1.8  {q10-q11}, [r1]
-+         sub     r1, #33
-+         edge_w64_body
-+-        // copy c to a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-+-        vmov.64 q2, q6
-+-        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-++        init_edge_64
-+         vpush {d8-d15}
-+ 1:      sub     r1, r3
-+         // load a
-+@@ -418,10 +292,10 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         vld1.8  {q0-q1}, [r1]!
-+         vld1.8  {q2-q3}, [r1], r3
-+         sub     r1, #33
-+-        subs    r4, #1
-++        subs    r12, #1
-+         // load c
-+-        vld1.8  {q4-q5}, [r1]!
-+-        vld1.8  {q6-q7}, [r1], r3
-++        vld1.8  {q4-q5}, [r1, :128]!
-++        vld1.8  {q6-q7}, [r1, :128], r3
-+         sub     r1, #32
-+         // load b
-+         sub     r1, #1
-+@@ -429,178 +303,176 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         vld1.8  {q10-q11}, [r1]
-+         sub     r1, #31
-+         edge_w64_body
-+-        // copy c to a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-+-        vmov.64 q2, q6
-+-        vmov.64 q3, q7
-+         bne   1b
-+         vpop  {d8-d15}
-+-        pop   {r4-r8}
-++        pop   {r4-r5}
-+         bx lr
-+ endfunc
-+ 
-++// inputs:
-++// a in q0, q1
-++// c in q2, q3
-++// b in q8, q9
-++// offset table in d31
-++// clobbered registers q0, q1, q10, q11, q12, q13
-++// output q0, q1
-++.macro edge_w32_body
-++        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
-++        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
-++        vcgt.u8 q13, q3, q1
-++        vcgt.u8 q1,  q1, q3
-++
-++        vsub.s8 q12, q0, q12 // diff0
-++        vcgt.u8  q0,  q2, q8 // c > b
-++        vsub.s8 q13, q1, q13 // diff0 part 2
-++
-++        vcgt.u8  q10,  q8, q2 // b > c
-++        vcgt.u8  q1,  q3, q9
-++        vcgt.u8  q11,  q9, q3
-++
-++        vsub.s8 q0, q10, q0 // diff1
-++
-++        vmov.s8 q10, #2 // 2 to all elements
-++        vsub.s8 q1, q11, q1 // diff1 part 2
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++
-++        vadd.s8 q0, q10
-++        vadd.s8 q1, q10
-++
-++        vmov.u8  q10, #128
-++        vtbl.8   d0, {d31}, d0
-++        vtbl.8   d1, {d31}, d1
-++        vtbl.8   d2, {d31}, d2
-++        vtbl.8   d3, {d31}, d3
-++
-++        vadd.s8    q11, q2, q10
-++        vadd.s8    q12, q3, q10
-++        vqadd.s8   q11, q0
-++        vqadd.s8   q12, q1
-++        vsub.s8    q0, q11, q10
-++        vsub.s8    q1, q12, q10
-++        vst1.8   {q0-q1}, [r0, :128], r2
-++.endm
-++
-++.macro init_edge_32
-++        ldr     r12, [sp, #4] // sao_offset_val_table
-++        vld1.32 {d31}, [r12]
-++        ldr     r12, [sp] // height
-++.endm
-++
-+ function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-+-        sub    r1, #8 // load 8 extra bytes
-+-1:      subs    r4, #1
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3 // only first 9 bytes are used
-+-        sub    r1, #32
-++        init_edge_32
-++        sub     r1, #4 // load 4 extra bytes
-++1:      subs    r12, #1
-++        vld1.32 d3[1], [r1]!
-++        vld1.8  {q2-q3}, [r1, :128]! // c
-++        vld1.32 d20[0], [r1], r3
-++        sub     r1, #36
-+         // a
-+-        vext.8  q0, q10, q11, #7
-+-        vext.8  q1, q11, q12, #7
-+-        // c
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-++        vext.8  q0, q1, q2, #15
-++        vext.8  q1, q2, q3, #15
-+         // b
-+-        vext.8  q8, q10, q11, #9
-+-        vext.8  q9, q11, q12, #9
-++        vext.8  q8, q2, q3, #1
-++        vext.8  q9, q3, q10, #1
-+         edge_w32_body
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-++        init_edge_32
-+         // load a
-+         sub     r1, r3
-+-        vld1.8  {q0-q1}, [r1], r3
-++        vld1.8  {q0-q1}, [r1, :128], r3
-+         // load c
-+-        vld1.8  {q4-q5}, [r1], r3
-+-1:      subs    r4, #1
-++        vld1.8  {q2-q3}, [r1, :128], r3
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q8-q9}, [r1], r3
-++        vld1.8  {q8-q9}, [r1, :128], r3
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vmov.64 q0, q4
-+-        vmov.64 q1, q5
-++        vmov.64 q0, q2
-++        vmov.64 q1, q3
-+         // c
-+-        vmov.64 q4, q8
-+-        vmov.64 q5, q9
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vmov.64 q2, q8
-++        vmov.64 q3, q9
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        ldr    r5, [r5]
-+-        vpush {d8-d15}
-++        init_edge_32
-++        vpush   {d8-d15}
-+         // load a
-+         sub     r1, r3
-+-        sub    r1, #8
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        sub     r1, #8
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {d24}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q0, q10, q11, #7
-+         vext.8  q1, q11, q12, #7
-+         // load c
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-+-        vext.8  q2, q10, q11, #7
-+-1:      subs    r4, #1
-++        vld1.8  {d9}, [r1, :64]!
-++        vld1.8  {q2-q3}, [r1, :64], r3
-++        sub     r1, #8
-++        vext.8  q4, q4, q2, #15
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {q12}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q8, q10, q11, #9
-+         vext.8  q9, q11, q12, #9
-+-        vext.8  q14, q10, q11, #8
-+-        vext.8  q15, q11, q12, #8
-+-        vext.8  q3, q10, q11, #7
-++        vext.8  q6, q10, q11, #8
-++        vext.8  q7, q11, q12, #8
-++        vext.8  q5, q10, q11, #7
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vmov.8 q0, q2
-+-        vext.8 q1, q4, q5, #15
-++        vmov.8  q0, q4
-++        vext.8  q1, q2, q3, #15
-+         // c
-+-        vmov.8  q4, q14
-+-        vmov.8  q5, q15
-+-        vmov.8  q2, q3
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vmov.8  q2, q6
-++        vmov.8  q3, q7
-++        vmov.8  q4, q5
-++        bne     1b
-++        vpop    {d8-d15}
-++        bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-+-        push  {r4-r8}
-+-        ldr    r4, [sp, #20] // height
-+-        ldr    r5, [sp, #24] // sao_offset_val_table
-+-        ldr    r6, =0x03
-+-        ldr    r7, [r5]
-+-        add    r5, #4
-+-        sub    r1, r3
-+-        ldr    r5, [r5]
-+-        sub    r1, #8
-+-        vpush {d8-d15}
-++        init_edge_32
-++        sub     r1, r3
-+         // load a
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q0, q10, q11, #9
-+-        vext.8  q1, q11, q12, #9
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {d24}, [r1, :64], r3
-++        sub     r1, #32
-++        vext.8  q0, q10, q11, #1
-++        vext.8  q1, q11, q12, #1
-+         // load c
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-+-        vext.8  q4, q10, q11, #8
-+-        vext.8  q5, q11, q12, #8
-+-        vext.8  q2, q12, q11, #8
-+-1:      subs    r4, #1
-++        vld1.8  {q2-q3}, [r1, :64]!
-++        vld1.8  {d30}, [r1, :64], r3
-++        sub     r1, #40
-++1:      subs    r12, #1
-+         // load b
-+-        vld1.8  {q10-q11}, [r1]
-+-        add    r1, #32
-+-        vld1.8  {q12}, [r1], r3
-+-        sub    r1, #32
-++        vld1.8  {q10-q11}, [r1, :64]!
-++        vld1.8  {q12}, [r1, :64], r3
-++        sub     r1, #32
-+         vext.8  q8, q10, q11, #7
-+         vext.8  q9, q11, q12, #7
-+-        vext.8  q3, q12, q10, #7
-++        vext.8  q14, q12, q10, #7
-+         edge_w32_body
-+         // inputs for next loop iteration
-+         // a
-+-        vext.8 q0, q4, q5, #1
-+-        vext.8 q1, q5, q2, #1
-++        vext.8  q0, q2, q3, #1
-++        vext.8  q1, q3, q15, #1
-+         // c
-+-        vext.8  q4, q8, q9, #1
-+-        vext.8  q5, q9, q3, #1
-+-        vext.8  q2, q3, q1, #1
-+-        bne   1b
-+-        vpop  {d8-d15}
-+-        pop   {r4-r8}
-+-        bx lr
-++        vext.8  q2, q8, q9, #1
-++        vext.8  q3, q9, q14, #1
-++        vext.8  d30, d28, d2, #1
-++        bne     1b
-++        bx      lr
-+ endfunc
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 016c39d46b86830204a4519590332d2a38f7ee51 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Thu, 8 Jan 2015 09:58:55 +0200
-+Subject: [PATCH 7/9] small optimization to SAO BAND. correct path for
-+ bit_depth_template.c
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c | 2 +-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 2 +-
-+ 2 files changed, 2 insertions(+), 2 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 8d6e863..385c35d 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -23,7 +23,7 @@
-+ #include "libavcodec/hevcdsp.h"
-+ #include "hevcdsp_arm.h"
-+ #include "libavcodec/avcodec.h"
-+-#include "../bit_depth_template.c"
-++#include "libavcodec/bit_depth_template.c"
-+ 
-+ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 5fc482b..710b32b 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -26,12 +26,12 @@
-+         pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-++        vmov.u8  q14, #128
-+ .endm
-+ 
-+ .macro sao_band_32
-+         vshr.u8  q8, q2, #3
-+         vshr.u8  q9, q3, #3
-+-        vmov.u8  q14, #128
-+         vtbl.8   d16, {q0, q1}, d16
-+         vtbl.8   d17, {q0, q1}, d17
-+         vtbl.8   d18, {q0, q1}, d18
-+-- 
-+2.5.0
-+
-+
-+From 579f1584d688e1ac24fb7d22697e2a7b64f62e8e Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Fri, 9 Jan 2015 10:28:52 +0200
-+Subject: [PATCH 8/9] Added height check for SAO NEON optimizations. Faster SAO
-+ band NEON Some reordering to use NEON pipelines more efficiently
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  12 +++-
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 142 ++++++++++++++++++++++---------------
-+ 2 files changed, 93 insertions(+), 61 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 385c35d..6d0689c 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -176,6 +176,7 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     int8_t offset_table[32] = { 0 };
-+     int k, y, x;
-+     int shift  = 3; // BIT_DEPTH - 5
-++    int cwidth = 0;
-+ 
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+@@ -183,7 +184,10 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+     for (k = 0; k < 4; k++)
-+         offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-+ 
-+-    switch(width){
-++    if (height % 8 == 0)
-++        cwidth = width;
-++
-++    switch(cwidth){
-+     case 8:
-+         ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-+         break;
-+@@ -223,15 +227,19 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+     pixel *src = (pixel *)_src;
-+     int a_stride, b_stride;
-+     int x, y;
-++    int cwidth = 0;
-+ 
-+     for (x = 0; x < 5; x++) {
-+         sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
-+     }
-+ 
-++    if (height % 8 == 0)
-++        cwidth = width;
-++
-+     stride_src /= sizeof(pixel);
-+     stride_dst /= sizeof(pixel);
-+ 
-+-    switch (width) {
-++    switch (cwidth) {
-+     case 32:
-+         switch(eo) {
-+         case 0:
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 710b32b..08f50b8 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -26,36 +26,59 @@
-+         pld      [r1]
-+         vld1.8   {q0, q1}, [r12]  // offset table
-+         ldr      r12, [sp, #4]    // height
-+-        vmov.u8  q14, #128
-++        vmov.u8  q3, #128
-+ .endm
-+ 
-+-.macro sao_band_32
-+-        vshr.u8  q8, q2, #3
-+-        vshr.u8  q9, q3, #3
-+-        vtbl.8   d16, {q0, q1}, d16
-+-        vtbl.8   d17, {q0, q1}, d17
-+-        vtbl.8   d18, {q0, q1}, d18
-+-        vtbl.8   d19, {q0, q1}, d19
-+-        vadd.s8  q2, q14
-+-        vadd.s8  q3, q14
-+-        vqadd.s8 q2, q8
-+-        vqadd.s8 q3, q9
-+-        vsub.s8  q2, q14
-+-        vsub.s8  q3, q14
-++// 128 in q3
-++// input q8 - q11
-++// 32 cycles
-++.macro sao_band_64
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-++        vtbl.8   d24, {d0, d1, d2, d3}, d24
-++        vadd.s8  q8, q3
-++        vtbl.8   d25, {d0, d1, d2, d3}, d25
-++        vadd.s8  q9, q3
-++        vtbl.8   d26, {d0, d1, d2, d3}, d26
-++        vadd.s8  q10, q3
-++        vtbl.8   d27, {d0, d1, d2, d3}, d27
-++        vadd.s8  q11, q3
-++        vtbl.8   d28, {d0, d1, d2, d3}, d28
-++        vqadd.s8 q8, q12
-++        vtbl.8   d29, {d0, d1, d2, d3}, d29
-++        vqadd.s8 q9, q13
-++        vtbl.8   d30, {d0, d1, d2, d3}, d30
-++        vqadd.s8 q10, q14
-++        vtbl.8   d31, {d0, d1, d2, d3}, d31
-++        vqadd.s8 q11, q15
-++        vsub.s8  q8, q3
-++        vsub.s8  q9, q3
-++        vsub.s8  q10, q3
-++        vsub.s8  q11, q3
-+ .endm
-+ 
-+ function ff_hevc_sao_band_w8_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #4
-+-        vld1.8   {d4}, [r1, :64], r3
-+-        vld1.8   {d5}, [r1, :64], r3
-+-        vld1.8   {d6}, [r1, :64], r3
-+-        vld1.8   {d7}, [r1, :64], r3
-+-        sao_band_32
-+-        vst1.8  {d4}, [r0, :64], r2
-+-        vst1.8  {d5}, [r0, :64], r2
-+-        vst1.8  {d6}, [r0, :64], r2
-+-        vst1.8  {d7}, [r0, :64], r2
-++1:      subs     r12, #8
-++        vld1.8   {d16}, [r1, :64], r3
-++        vld1.8   {d17}, [r1, :64], r3
-++        vld1.8   {d18}, [r1, :64], r3
-++        vld1.8   {d19}, [r1, :64], r3
-++        vld1.8   {d20}, [r1, :64], r3
-++        vld1.8   {d21}, [r1, :64], r3
-++        vld1.8   {d22}, [r1, :64], r3
-++        vld1.8   {d23}, [r1, :64], r3
-++        sao_band_64
-++        vst1.8  {d16}, [r0, :64], r2
-++        vst1.8  {d17}, [r0, :64], r2
-++        vst1.8  {d18}, [r0, :64], r2
-++        vst1.8  {d19}, [r0, :64], r2
-++        vst1.8  {d20}, [r0, :64], r2
-++        vst1.8  {d21}, [r0, :64], r2
-++        vst1.8  {d22}, [r0, :64], r2
-++        vst1.8  {d23}, [r0, :64], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -63,12 +86,16 @@ endfunc
-+ 
-+ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #2
-+-        vld1.8  {q2}, [r1, :128], r3
-+-        vld1.8  {q3}, [r1, :128], r3
-+-        sao_band_32
-+-        vst1.8   {q2}, [r0, :128], r2
-+-        vst1.8   {q3}, [r0, :128], r2
-++1:      subs     r12, #4
-++        vld1.8  {q8}, [r1, :128], r3
-++        vld1.8  {q9}, [r1, :128], r3
-++        vld1.8  {q10}, [r1, :128], r3
-++        vld1.8  {q11}, [r1, :128], r3
-++        sao_band_64
-++        vst1.8   {q8}, [r0, :128], r2
-++        vst1.8   {q9}, [r0, :128], r2
-++        vst1.8   {q10}, [r0, :128], r2
-++        vst1.8   {q11}, [r0, :128], r2
-+         bne    1b
-+ 
-+         bx lr
-+@@ -76,10 +103,12 @@ endfunc
-+ 
-+ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+-1:      subs     r12, #1
-+-        vld1.8   {q2-q3}, [r1, :128], r3
-+-        sao_band_32
-+-        vst1.8   {q2-q3}, [r0, :128], r2
-++1:      subs     r12, #2
-++        vld1.8   {q8-q9}, [r1, :128], r3
-++        vld1.8   {q10-q11}, [r1, :128], r3
-++        sao_band_64
-++        vst1.8   {q8-q9}, [r0, :128], r2
-++        vst1.8   {q10-q11}, [r0, :128], r2
-+         bne      1b
-+ 
-+         bx       lr
-+@@ -89,13 +118,12 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+         init_sao_band
-+ 1:      subs      r12, #1
-+         pld       [r1, r3]
-+-        vld1.8    {q2-q3}, [r1, :128]!
-+-        sao_band_32
-+-        vst1.8    {q2-q3}, [r0, :128]!
-+-        vld1.8    {q2-q3}, [r1, :128], r3
-++        vld1.8    {q8-q9}, [r1, :128]!
-++        vld1.8    {q10-q11}, [r1, :128], r3
-+         sub       r1, #32
-+-        sao_band_32
-+-        vst1.8    {q2-q3}, [r0, :128], r2
-++        sao_band_64
-++        vst1.8    {q8-q9}, [r0, :128]!
-++        vst1.8    {q10-q11}, [r0, :128], r2
-+         sub       r0, #32
-+         bne       1b
-+ 
-+@@ -121,7 +149,6 @@ endfunc
-+         vcgt.u8  q1,  q5, q9
-+         vcgt.u8 q15,  q9, q5
-+         vsub.s8  q0, q14, q0 // diff1
-+-
-+         vsub.s8  q1, q15, q1
-+ 
-+         vadd.s8  q0, q12 //diff0 + diff1
-+@@ -157,27 +184,25 @@ endfunc
-+ 
-+         vmov.u8  q15, #128 // s8 #-128
-+         vtbl.8   d0, {d24}, d0
-++        vadd.s8  q13,  q4, q15
-+         vtbl.8   d1, {d24}, d1
-++        vadd.s8  q14,  q5, q15
-+         vtbl.8   d2, {d24}, d2
-++        vqadd.s8 q0, q13
-+         vtbl.8   d3, {d24}, d3
-++        vqadd.s8 q1, q14
-+         vtbl.8   d4, {d24}, d4
-++        vadd.s8  q13,  q6, q15
-+         vtbl.8   d5, {d24}, d5
-++        vadd.s8  q14,  q7, q15
-+         vtbl.8   d6, {d24}, d6
-++        vqadd.s8 q2, q13
-+         vtbl.8   d7, {d24}, d7
-+-
-+-        vadd.s8  q12,  q4, q15
-+-        vadd.s8  q13,  q5, q15
-+-        vadd.s8  q14,  q6, q15
-+-        vadd.s8  q15,  q7, q15
-+-        vqadd.s8 q12,  q0
-+-        vqadd.s8 q15,  q3
-+-        vmov.u8   q3, #128 // s8 #-128
-+-        vqadd.s8 q13,  q1
-+-        vqadd.s8 q14,  q2
-+-        vsub.s8   q0, q12, q3
-+-        vsub.s8   q1, q13, q3
-+-        vsub.s8   q2, q14, q3
-+-        vsub.s8   q3, q15, q3
-++        vqadd.s8 q3, q14
-++        vsub.s8   q0, q15
-++        vsub.s8   q1, q15
-++        vsub.s8   q2, q15
-++        vsub.s8   q3, q15
-+         vst1.8  {q0-q1}, [r0, :128]!
-+         vst1.8  {q2-q3}, [r0, :128], r2
-+         sub     r0, #32
-+@@ -342,13 +367,12 @@ endfunc
-+ 
-+         vmov.u8  q10, #128
-+         vtbl.8   d0, {d31}, d0
-++        vadd.s8  q11, q2, q10
-+         vtbl.8   d1, {d31}, d1
-++        vadd.s8  q12, q3, q10
-+         vtbl.8   d2, {d31}, d2
-++        vqadd.s8 q11, q0
-+         vtbl.8   d3, {d31}, d3
-+-
-+-        vadd.s8    q11, q2, q10
-+-        vadd.s8    q12, q3, q10
-+-        vqadd.s8   q11, q0
-+         vqadd.s8   q12, q1
-+         vsub.s8    q0, q11, q10
-+         vsub.s8    q1, q12, q10
-+-- 
-+2.5.0
-+
-+
-+From 026bac1824e4936e948e6b1efec82868c520ea66 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Mon, 2 Feb 2015 16:08:27 +0200
-+Subject: [PATCH 9/9] Further SAO NEON optimisations
-+
-+---
-+ libavcodec/arm/hevcdsp_init_neon.c |  16 +--
-+ libavcodec/arm/hevcdsp_sao_neon.S  | 224 +++++++++++++++++++------------------
-+ 2 files changed, 124 insertions(+), 116 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 6d0689c..e5da7e9 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -45,10 +45,10 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+                                       ptrdiff_t stride);
-+ 
-+-void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-+-void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int8_t * offset_table, int height);
-++void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-++void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height);
-+ 
-+ void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+ void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
-+@@ -189,16 +189,16 @@ static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_
-+ 
-+     switch(cwidth){
-+     case 8:
-+-        ff_hevc_sao_band_w8_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 16:
-+-        ff_hevc_sao_band_w16_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 32:
-+-        ff_hevc_sao_band_w32_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     case 64:
-+-        ff_hevc_sao_band_w64_neon_8(_dst, _src, stride_dst, stride_src, offset_table, height);
-++        ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
-+         break;
-+     default:
-+         for (y = 0; y < height; y++) {
-+diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
-+index 08f50b8..9c7808d 100644
-+--- a/libavcodec/arm/hevcdsp_sao_neon.S
-++++ b/libavcodec/arm/hevcdsp_sao_neon.S
-+@@ -22,21 +22,16 @@
-+ #include "neon.S"
-+ 
-+ .macro init_sao_band
-+-        ldr      r12, [sp, #0]    // offset_table address
-+         pld      [r1]
-+-        vld1.8   {q0, q1}, [r12]  // offset table
-+-        ldr      r12, [sp, #4]    // height
-++        vld1.8   {q0, q1}, [r2]  // offset table
-++        ldr       r2, [sp, #0]   // stride_dst
-++        ldr      r12, [sp, #4]   // height
-+         vmov.u8  q3, #128
-+ .endm
-+ 
-+ // 128 in q3
-+ // input q8 - q11
-+-// 32 cycles
-+ .macro sao_band_64
-+-        vshr.u8  q12, q8, #3
-+-        vshr.u8  q13, q9, #3
-+-        vshr.u8  q14, q10, #3
-+-        vshr.u8  q15, q11, #3
-+         vtbl.8   d24, {d0, d1, d2, d3}, d24
-+         vadd.s8  q8, q3
-+         vtbl.8   d25, {d0, d1, d2, d3}, d25
-+@@ -52,8 +47,8 @@
-+         vtbl.8   d30, {d0, d1, d2, d3}, d30
-+         vqadd.s8 q10, q14
-+         vtbl.8   d31, {d0, d1, d2, d3}, d31
-+-        vqadd.s8 q11, q15
-+         vsub.s8  q8, q3
-++        vqadd.s8 q11, q15
-+         vsub.s8  q9, q3
-+         vsub.s8  q10, q3
-+         vsub.s8  q11, q3
-+@@ -64,12 +59,16 @@ function ff_hevc_sao_band_w8_neon_8, export=1
-+ 1:      subs     r12, #8
-+         vld1.8   {d16}, [r1, :64], r3
-+         vld1.8   {d17}, [r1, :64], r3
-++        vshr.u8  q12, q8, #3
-+         vld1.8   {d18}, [r1, :64], r3
-+         vld1.8   {d19}, [r1, :64], r3
-++        vshr.u8  q13, q9, #3
-+         vld1.8   {d20}, [r1, :64], r3
-+         vld1.8   {d21}, [r1, :64], r3
-++        vshr.u8  q14, q10, #3
-+         vld1.8   {d22}, [r1, :64], r3
-+         vld1.8   {d23}, [r1, :64], r3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8  {d16}, [r0, :64], r2
-+         vst1.8  {d17}, [r0, :64], r2
-+@@ -88,9 +87,13 @@ function ff_hevc_sao_band_w16_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #4
-+         vld1.8  {q8}, [r1, :128], r3
-++        vshr.u8  q12, q8, #3
-+         vld1.8  {q9}, [r1, :128], r3
-++        vshr.u8  q13, q9, #3
-+         vld1.8  {q10}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-+         vld1.8  {q11}, [r1, :128], r3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8   {q8}, [r0, :128], r2
-+         vst1.8   {q9}, [r0, :128], r2
-+@@ -105,7 +108,11 @@ function ff_hevc_sao_band_w32_neon_8, export=1
-+         init_sao_band
-+ 1:      subs     r12, #2
-+         vld1.8   {q8-q9}, [r1, :128], r3
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-+         vld1.8   {q10-q11}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-+         sao_band_64
-+         vst1.8   {q8-q9}, [r0, :128], r2
-+         vst1.8   {q10-q11}, [r0, :128], r2
-+@@ -119,7 +126,11 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+ 1:      subs      r12, #1
-+         pld       [r1, r3]
-+         vld1.8    {q8-q9}, [r1, :128]!
-++        vshr.u8  q12, q8, #3
-++        vshr.u8  q13, q9, #3
-+         vld1.8    {q10-q11}, [r1, :128], r3
-++        vshr.u8  q14, q10, #3
-++        vshr.u8  q15, q11, #3
-+         sub       r1, #32
-+         sao_band_64
-+         vst1.8    {q8-q9}, [r0, :128]!
-+@@ -129,51 +140,18 @@ function ff_hevc_sao_band_w64_neon_8, export=1
-+ 
-+         bx lr
-+ endfunc
-+-// input
-+-// a in q0 - q3
-+-// c in q4 - q7
-+-// b in q8 - q11
-+-// offset table in r7 and r5
-+-// output in q0 - q3
-+-// clobbers q12 - q15
-+-.macro edge_w64_body
-+-        vcgt.u8 q12,  q4, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8  q0,  q0, q4 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13,  q5, q1
-+-        vcgt.u8  q1,  q1, q5
-+-        vsub.s8 q12,  q0, q12 // diff0
-+-        vcgt.u8  q0,  q4, q8 // c > b
-+-        vsub.s8 q13,  q1, q13
-+-
-+-        vcgt.u8 q14,  q8, q4 // b > c
-+-        vcgt.u8  q1,  q5, q9
-+-        vcgt.u8 q15,  q9, q5
-+-        vsub.s8  q0, q14, q0 // diff1
-+-        vsub.s8  q1, q15, q1
-+ 
-+-        vadd.s8  q0, q12 //diff0 + diff1
-+-        vadd.s8  q1, q13
-+-
-+-        vcgt.u8 q14,  q6, q2
-+-        vcgt.u8  q2,  q2, q6
-+-        vcgt.u8 q15,  q7, q3
-+-        vcgt.u8  q3,  q3, q7
-+-
-+-        vsub.s8 q14,  q2, q14
-+-        vcgt.u8  q2,  q6, q10
-+-        vsub.s8 q15,  q3, q15
-+-
-+-        vcgt.u8 q12, q10, q6
-+-        vcgt.u8  q3,  q7, q11
-+-        vcgt.u8 q13, q11, q7
-+-        vsub.s8  q2, q12, q2
-+-        vsub.s8  q3, q13, q3
-++.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
-++        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
-++        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
-++        vcgt.u8 \out1, \in3, \in1  // c > a -> -1 , otherwise 0 part 2
-++        vcgt.u8 \tmp1,  \in1, \in3  // a > c -> -1 , otherwise 0 part 2
-++        vsub.s8 \out0, \tmp0, \out0 // diff0
-++        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
-++.endm
-+ 
-++.macro table64
-+         vmov.s8 q13, #2 // 2 to all elements
-+-
-+-        vadd.s8  q2, q14
-+-        vadd.s8  q3, q15
-+-
-+         vmov.32  d24[0], r4  // load offset table from general registers
-+         vmov.32  d24[1], r5  // load rest of offset table
-+ 
-+@@ -208,6 +186,28 @@ endfunc
-+         sub     r0, #32
-+ .endm
-+ 
-++// input
-++// a in q0 - q3
-++// c in q4 - q7
-++// b in q8 - q11
-++// offset table in r7 and r5
-++// output in q0 - q3
-++// clobbers q12 - q15
-++.macro edge_w64_body
-++        diff32 q12, q13, q0, q1, q0, q1, q4, q5
-++        diff32 q0, q1, q14, q15, q8, q9, q4, q5
-++
-++        vadd.s8  q0, q12 //diff0 + diff1
-++        vadd.s8  q1, q13
-++
-++        diff32  q14, q15, q2, q3, q2, q3, q6, q7
-++        diff32  q2, q3, q12, q13, q10, q11, q6, q7
-++
-++        vadd.s8  q2, q14
-++        vadd.s8  q3, q15
-++        table64
-++.endm
-++
-+ .macro init_edge_64
-+         push   {r4-r5}
-+         ldr    r12, [sp, #8] // height
-+@@ -334,38 +334,23 @@ function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
-+         bx lr
-+ endfunc
-+ 
-+-// inputs:
-+-// a in q0, q1
-+-// c in q2, q3
-+-// b in q8, q9
-+-// offset table in d31
-+-// clobbered registers q0, q1, q10, q11, q12, q13
-+-// output q0, q1
-+-.macro edge_w32_body
-+-        vcgt.u8 q12, q2, q0 // c > a -> -1 , otherwise 0
-+-        vcgt.u8 q0,  q0, q2 // a > c -> -1 , otherwise 0
-+-        vcgt.u8 q13, q3, q1
-+-        vcgt.u8 q1,  q1, q3
-+-
-+-        vsub.s8 q12, q0, q12 // diff0
-+-        vcgt.u8  q0,  q2, q8 // c > b
-+-        vsub.s8 q13, q1, q13 // diff0 part 2
-+-
-+-        vcgt.u8  q10,  q8, q2 // b > c
-+-        vcgt.u8  q1,  q3, q9
-+-        vcgt.u8  q11,  q9, q3
-+-
-+-        vsub.s8 q0, q10, q0 // diff1
-+-
-+-        vmov.s8 q10, #2 // 2 to all elements
-+-        vsub.s8 q1, q11, q1 // diff1 part 2
-+-        vadd.s8 q0, q12 //diff0 + diff1
-+-        vadd.s8 q1, q13
-++.macro init_edge_32
-++        ldr     r12, [sp, #4] // sao_offset_val_table
-++        vld1.32 {d31}, [r12]
-++        ldr     r12, [sp] // height
-++.endm
-+ 
-+-        vadd.s8 q0, q10
-+-        vadd.s8 q1, q10
-++.macro diff out0, tmp0, in0, in1
-++        vcgt.u8 \out0, \in1, \in0  // c > a -> -1 , otherwise 0
-++        vcgt.u8 \tmp0,  \in0, \in1  // a > c -> -1 , otherwise 0
-++        vsub.s8 \out0, \tmp0, \out0 // diff0
-++.endm
-+ 
-+-        vmov.u8  q10, #128
-++.macro table32
-++        vmov.s8  q10, #2
-++        vadd.s8  q0, q10
-++        vadd.s8  q1, q10
-++        vmov.s8  q10, #128
-+         vtbl.8   d0, {d31}, d0
-+         vadd.s8  q11, q2, q10
-+         vtbl.8   d1, {d31}, d1
-+@@ -373,56 +358,68 @@ endfunc
-+         vtbl.8   d2, {d31}, d2
-+         vqadd.s8 q11, q0
-+         vtbl.8   d3, {d31}, d3
-+-        vqadd.s8   q12, q1
-+-        vsub.s8    q0, q11, q10
-+-        vsub.s8    q1, q12, q10
-++        vqadd.s8 q12, q1
-++        vsub.s8  q0, q11, q10
-++        vsub.s8  q1, q12, q10
-+         vst1.8   {q0-q1}, [r0, :128], r2
-+ .endm
-+ 
-+-.macro init_edge_32
-+-        ldr     r12, [sp, #4] // sao_offset_val_table
-+-        vld1.32 {d31}, [r12]
-+-        ldr     r12, [sp] // height
-+-.endm
-+-
-+ function ff_hevc_sao_edge_eo0_w32_neon_8, export=1
-+         init_edge_32
-+-        sub     r1, #4 // load 4 extra bytes
-++        vpush {q4-q7}
-++        sub     r1, #4
-+ 1:      subs    r12, #1
-+-        vld1.32 d3[1], [r1]!
-+-        vld1.8  {q2-q3}, [r1, :128]! // c
-+-        vld1.32 d20[0], [r1], r3
-+-        sub     r1, #36
-++        vld1.8  {q13-q14}, [r1]!
-++        vld1.32 d30, [r1], r3
-++        sub     r1, #32
-+         // a
-+-        vext.8  q0, q1, q2, #15
-+-        vext.8  q1, q2, q3, #15
-+-        // b
-+-        vext.8  q8, q2, q3, #1
-+-        vext.8  q9, q3, q10, #1
-+-        edge_w32_body
-++        vext.8   q0, q13, q14, #3
-++        vext.8   q1, q14, q15, #3
-++        vshr.u64 d24, d30, #24
-++        // c
-++        vext.8   q2, q13, q14, #4
-++        vext.8   q3, q14, q15, #4
-++        vshr.u64 d16, d30, #32
-++        // diff0
-++        diff32 q13, q14, q4, q5, q0, q1, q2, q3
-++        diff   d18, d25, d24, d16
-++        // -diff1
-++        vext.s8 q0, q13, q14, #1
-++        vext.s8 q1, q14, q9, #1
-++
-++        vsub.s8 q0, q13, q0 //diff0 + diff1
-++        vsub.s8 q1, q14, q1
-++        table32
-+         bne     1b
-++        vpop {q4-q7}
-++
-+         bx      lr
-+ endfunc
-+ 
-+ function ff_hevc_sao_edge_eo1_w32_neon_8, export=1
-+         init_edge_32
-++        vpush {q4-q7}
-+         // load a
-+         sub     r1, r3
-+         vld1.8  {q0-q1}, [r1, :128], r3
-+         // load c
-+         vld1.8  {q2-q3}, [r1, :128], r3
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a )
-+ 1:      subs    r12, #1
-+         // load b
-+         vld1.8  {q8-q9}, [r1, :128], r3
-+-        edge_w32_body
-+-        // inputs for next loop iteration
-+-        // a
-+-        vmov.64 q0, q2
-+-        vmov.64 q1, q3
-++        diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b )
-++        vadd.s8 q0, q4, q12 //diff0 + diff1
-++        vadd.s8 q1, q5, q13
-++        table32
-++        // CMP ( c, a )
-++        vneg.s8 q12, q4
-++        vneg.s8 q13, q5
-+         // c
-+         vmov.64 q2, q8
-+         vmov.64 q3, q9
-+         bne     1b
-++        vpop {q4-q7}
-+         bx      lr
-+ endfunc
-+ 
-+@@ -452,7 +449,11 @@ function ff_hevc_sao_edge_eo2_w32_neon_8, export=1
-+         vext.8  q6, q10, q11, #8
-+         vext.8  q7, q11, q12, #8
-+         vext.8  q5, q10, q11, #7
-+-        edge_w32_body
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        table32
-+         // inputs for next loop iteration
-+         // a
-+         vmov.8  q0, q4
-+@@ -487,7 +488,14 @@ function ff_hevc_sao_edge_eo3_w32_neon_8, export=1
-+         vext.8  q8, q10, q11, #7
-+         vext.8  q9, q11, q12, #7
-+         vext.8  q14, q12, q10, #7
-+-        edge_w32_body
-++
-++        diff32 q12, q13, q0, q1, q0, q1, q2, q3
-++        diff32 q0, q1, q10, q11, q8, q9, q2, q3
-++
-++        vadd.s8 q0, q12 //diff0 + diff1
-++        vadd.s8 q1, q13
-++        table32
-++
-+         // inputs for next loop iteration
-+         // a
-+         vext.8  q0, q2, q3, #1
-+-- 
-+2.5.0
-+
-diff --git a/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch b/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-new file mode 100644
-index 0000000..5e8e07d
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/hevcdsp_ARM_NEON_optimized_epel_functions.patch
-@@ -0,0 +1,409 @@
-+From 29c3327a0d72a7e872ff170363cfe5ed13bca5d0 Mon Sep 17 00:00:00 2001
-+From: Seppo Tomperi <seppo.tomperi@vtt.fi>
-+Date: Tue, 22 Dec 2015 18:10:24 +0000
-+Subject: [PATCH] hevcdsp: ARM NEON optimized epel functions
-+
-+---
-+ libavcodec/arm/Makefile            |   1 +
-+ libavcodec/arm/hevcdsp_epel_neon.S | 334 +++++++++++++++++++++++++++++++++++++
-+ libavcodec/arm/hevcdsp_init_neon.c |  23 +++
-+ 3 files changed, 358 insertions(+)
-+ create mode 100644 libavcodec/arm/hevcdsp_epel_neon.S
-+
-+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-+index cdd35b0..6051ec8 100644
-+--- a/libavcodec/arm/Makefile
-++++ b/libavcodec/arm/Makefile
-+@@ -131,6 +131,7 @@ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
-+                                           arm/synth_filter_neon.o
-+ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
-+                                           arm/hevcdsp_deblock_neon.o    \
-++                                          arm/hevcdsp_epel_neon.o       \
-+                                           arm/hevcdsp_idct_neon.o       \
-+                                           arm/hevcdsp_qpel_neon.o
-+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
-+diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-+new file mode 100644
-+index 0000000..516ae5b
-+--- /dev/null
-++++ b/libavcodec/arm/hevcdsp_epel_neon.S
-+@@ -0,0 +1,334 @@
-++/*
-++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-++ *
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#include "libavutil/arm/asm.S"
-++#include "neon.S"
-++
-++#define MAX_PB_SIZE #64
-++
-++.macro vextin_d4
-++    vld1.8    {q10}, [r1], r2    // load one 16-byte source row into q10 (d20:d21), then r1 += r2
-++    vmov      d16, d20           // d16 = row bytes 0..7  (tap 0 view)
-++    vext.8    d17, d20, d21, #1  // d17 = row bytes 1..8  (tap 1 view)
-++    vext.8    d18, d20, d21, #2  // d18 = row bytes 2..9  (tap 2 view)
-++    vext.8    d19, d20, d21, #3  // d19 = row bytes 3..10 (tap 3 view)
-++.endm
-++
-++.macro vextin_d4_8
-++    vld1.8    d16, [r1], r2      // narrow variant: load only 8 source bytes into d16, then r1 += r2
-++    vext.8    d17, d16, d16, #1  // both operands are d16, so the top lanes wrap around to the row start
-++    vext.8    d18, d16, d16, #2  // only the low lanes hold contiguous source bytes
-++    vext.8    d19, d16, d16, #3  // lanes 0..4 are valid here; the users below store at most 4 results
-++.endm
-++
-++.macro load_coeffs_16b coeffs
-++    ldr      \coeffs, [\coeffs]  // fetch the four 1-byte taps as one word; \coeffs is clobbered from here on
-++    vdup.i8  d0, \coeffs         // d0 = low byte (tap 0) broadcast to all 8 lanes
-++    lsr      \coeffs, #8
-++    vdup.i8  d1, \coeffs         // d1 = tap 1
-++    lsr      \coeffs, #8
-++    vdup.i8  d2, \coeffs         // d2 = tap 2
-++    lsr      \coeffs, #8
-++    vdup.i8  d3, \coeffs         // d3 = tap 3 (byte order assumes a little-endian word load - TODO confirm)
-++.endm
-++
-++.macro epel_filter_16b out=q12
-++    vmull.u8 q3, d16, d0         // 4-tap filter on 8 pixels: \out = d17*d1 + d18*d2 - (d16*d0 + d19*d3)
-++    vmull.u8 q11, d19, d3        // taps are unsigned magnitudes; the two outer taps are subtracted below
-++    vmull.u8 \out, d17, d1
-++    vmull.u8 q10, d18, d2
-++    vadd.s16 q3, q11             // q3 = outer-tap sum
-++    vadd.s16 \out, q10           // \out = inner-tap sum
-++    vsub.s16 \out, q3            // result is 16-bit, unshifted; clobbers q3, q10, q11
-++.endm
-++
-++.macro load_coeffs_32b coeffs
-++    ldr      \coeffs, [\coeffs]  // fetch the four 1-byte taps as one word; \coeffs is clobbered from here on
-++    vmov.i64 d4, #0              // clear d4 so the odd bytes stay zero
-++    vmov.8   d4[0], \coeffs      // tap 0 -> byte 0, i.e. 16-bit lane d4[0] when read as s16
-++    lsr      \coeffs, #8
-++    vmov.8   d4[2], \coeffs      // tap 1 -> 16-bit lane d4[1]
-++    lsr      \coeffs, #8
-++    vmov.8   d4[4], \coeffs      // tap 2 -> 16-bit lane d4[2]
-++    lsr      \coeffs, #8
-++    vmov.8   d4[6], \coeffs      // tap 3 -> 16-bit lane d4[3]
-++.endm
-++
-++.macro epel_filter_32b
-++    vmull.s16 q3, d24, d4[0] //q12  second-pass filter over four 16-bit rows held in q12..q15
-++    vmull.s16 q4, d25, d4[0]
-++    vmull.s16 q5, d30, d4[3] //q15
-++    vmull.s16 q6, d31, d4[3]
-++
-++    vmull.s16 q7, d26, d4[1] // q13
-++    vmull.s16 q8, d27, d4[1]
-++    vmull.s16 q9, d28, d4[2] // q14
-++    vmull.s16 q10, d29, d4[2]
-++    vadd.s32 q3, q5          // q3:q4 = outer rows: q12*tap0 + q15*tap3
-++    vadd.s32 q4, q6
-++    vadd.s32 q7, q9          // q7:q8 = inner rows: q13*tap1 + q14*tap2
-++    vadd.s32 q8, q10
-++    vsub.s32 q7, q3          // inner minus outer, same sign convention as epel_filter_16b
-++    vsub.s32 q8, q4
-++    vqshrn.s32  d6, q7, #6   // >> 6 with saturating narrow; 8 results land in q3 (d6:d7)
-++    vqshrn.s32  d7, q8, #6   // clobbers q3-q10, which includes callee-saved q4-q7
-++.endm
-++
-++.macro epel_filter_32b_4
-++    vmull.s16 q3, d24, d4[0] //q12  half-width variant of epel_filter_32b: low halves of q12..q15 only
-++    vmull.s16 q5, d30, d4[3] //q15
-++    vmull.s16 q7, d26, d4[1] // q13
-++    vmull.s16 q9, d28, d4[2] // q14
-++    vadd.s32 q3, q5          // outer rows
-++    vadd.s32 q7, q9          // inner rows
-++    vsub.s32 q7, q3          // inner minus outer
-++    vqshrn.s32  d6, q7, #6   // >> 6 with saturating narrow; 4 results land in d6
-++.endm
-++
-++function ff_hevc_put_epel_h_neon_8, export=1
-++        push   {r4-r7}            // horizontal 4-tap epel; presumably r0=dst r1=src r2=srcstride r3=height per the C prototype
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r7, [sp, #16] // mx
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2             // r7 = (mx - 1) * 4: byte offset of the tap row in epel_coeffs
-++        vpush {d8-d15}            // NOTE(review): nothing in this function touches d8-d15; save looks redundant
-++        adrl   r12, epel_coeffs
-++        add    r7, r12
-++        sub       r1, #1          // filter window starts one pixel left of src
-++        lsl       r4, #1          // r4 = dst row stride in bytes (MAX_PB_SIZE 16-bit samples)
-++        load_coeffs_16b r7
-++        mov   r12, r3             // r12 = saved height, reloaded for every column strip
-++        mov   r6, r0              // r6 = dst base of the current strip
-++        mov   r7, r1              // r7 = src base of the current strip
-++        cmp       r5, #6
-++        bgt       8f              // width > 6: 8-pixel strips
-++        cmp       r5, #4
-++        blt       2f              // width < 4: 2-pixel strip
-++        b         4f              // width 4..6: 4-pixel strip first
-++8:      subs r3, #1               // flags from this subs are consumed by the bne below (NEON ops leave them alone)
-++        pld [r1]
-++        vextin_d4
-++        epel_filter_16b
-++        vst1.16    {q12}, [r0], r4    // store 8 results, advance dst by one row
-++        bne 8b
-++        subs    r5, #8            // 8 columns done
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16         // next strip: +8 samples (16 bytes) in dst
-++        mov       r0, r6
-++        add       r7, #8          // next strip: +8 pixels in src
-++        mov       r1, r7
-++        cmp       r5, #4
-++        bgt       8b              // more than 4 columns left: another 8-wide strip
-++4:      subs r3, #1
-++        pld [r1]
-++        vextin_d4_8
-++        epel_filter_16b
-++        vst1.16    d24, [r0], r4      // store 4 results
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++2:      subs r3, #1
-++        pld [r1]
-++        vextin_d4_8
-++        epel_filter_16b
-++        vst1.32    d24[0], [r0], r4   // store 2 results (one 32-bit lane)
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_put_epel_v_neon_8, export=1
-++        push   {r4-r7}            // vertical 4-tap epel; presumably r0=dst r1=src r2=srcstride r3=height per the C prototype
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r7, [sp, #20] // my
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2             // r7 = (my - 1) * 4: byte offset of the tap row in epel_coeffs
-++        vpush {d8-d15}            // NOTE(review): nothing in this function touches d8-d15; save looks redundant
-++        adrl   r12, epel_coeffs
-++        add    r7, r12
-++        load_coeffs_16b r7
-++        sub       r1, r2          // filter window starts one row above src
-++        lsl       r4, #1          // r4 = dst row stride in bytes (MAX_PB_SIZE 16-bit samples)
-++        mov   r12, r3             // r12 = saved height, reloaded for every column strip
-++        mov   r6, r0              // r6 = dst base of the current strip
-++        mov   r7, r1              // r7 = src base of the current strip
-++0:      pld [r1]                  // strip entry: preload the first three rows into d16..d18
-++        vld1.8    {d16}, [r1], r2
-++        pld [r1]
-++        vld1.8    {d17}, [r1], r2
-++        pld [r1]
-++        vld1.8    {d18}, [r1], r2
-++        cmp       r5, #6
-++        bgt       8f              // width > 6: 8-pixel strip
-++        cmp       r5, #4
-++        blt       2f              // width < 4: 2-pixel strip
-++        b         4f              // width 4..6: 4-pixel strip
-++8:      pld [r1]
-++        vld1.8    {d19}, [r1], r2     // fourth row of the window
-++        subs r3, #1               // flags consumed by the bne below
-++        epel_filter_16b
-++        vst1.16    {q12}, [r0], r4    // store 8 results, advance dst by one row
-++        vmov d16, d17             // slide the 4-row window down by one row
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 8b
-++        subs    r5, #8            // 8 columns done
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16         // next strip: +8 samples (16 bytes) in dst
-++        mov       r0, r6
-++        add       r7, #8          // next strip: +8 pixels in src
-++        mov       r1, r7
-++        b         0b              // re-prime the window and re-dispatch on the remaining width
-++4:      pld       [r1]
-++        vld1.8    {d19}, [r1], r2
-++        subs r3, #1
-++        epel_filter_16b
-++        vst1.16    d24, [r0], r4      // store 4 results
-++        vmov d16, d17
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++        b         0b
-++2:      pld [r1]
-++        vld1.8    {d19}, [r1], r2
-++        subs r3, #1
-++        epel_filter_16b
-++        vst1.32    d24[0], [r0], r4   // store 2 results (one 32-bit lane)
-++        vmov d16, d17
-++        vmov d17, d18
-++        vmov d18, d19
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++function ff_hevc_put_epel_hv_neon_8, export=1
-++        push   {r4-r7}            // 2-D epel: horizontal pass to 16 bit, then vertical pass; presumably r0=dst r1=src r2=srcstride r3=height
-++        mov    r4, MAX_PB_SIZE
-++        ldr    r6, [sp, #16] // mx
-++        ldr    r7, [sp, #20] // my
-++        ldr    r5, [sp, #24] // width
-++        sub    r7, #1
-++        lsl    r7, #2             // r7 = (my - 1) * 4
-++        vpush {d8-d15}            // needed here: epel_filter_32b clobbers q4-q7
-++        adrl   r12, epel_coeffs
-++        sub    r6, #1
-++        lsl    r6, #2
-++        add    r6, r12 // mx epel coeff offset
-++        add    r7, r12            // my epel coeff address
-++        sub       r1, #1          // window starts one pixel left ...
-++        sub       r1, r2          // ... and one row above src
-++        lsl       r4, #1          // r4 = dst row stride in bytes (MAX_PB_SIZE 16-bit samples)
-++        load_coeffs_16b r6        // horizontal taps -> d0..d3
-++        load_coeffs_32b r7        // vertical taps -> 16-bit lanes of d4
-++        mov   r12, r3             // r12 = saved height, reloaded for every column strip
-++        mov   r6, r0              // r6 = dst base of the current strip
-++        mov   r7, r1              // r7 = src base of the current strip
-++0:      pld   [r1]                // strip entry: horizontally filter the first three rows into q12..q14
-++        vextin_d4
-++        epel_filter_16b q12
-++        pld   [r1]
-++        vextin_d4
-++        epel_filter_16b q13
-++        pld   [r1]
-++        vextin_d4
-++        epel_filter_16b q14
-++        cmp       r5, #6
-++        bgt       8f              // width > 6: 8-pixel strip
-++        cmp       r5, #4
-++        blt       2f              // width < 4: 2-pixel strip
-++        b         4f              // width 4..6: 4-pixel strip
-++8:      pld     [r1]
-++        vextin_d4
-++        epel_filter_16b q15       // fourth horizontally filtered row
-++        subs r3, #1               // flags consumed by the bne below
-++        epel_filter_32b
-++        vst1.16    {q3}, [r0], r4     // store 8 results, advance dst by one row
-++        vmov q12, q13             // slide the 4-row window down by one row
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 8b
-++        subs    r5, #8            // 8 columns done
-++        beq  99f
-++        mov       r3, r12
-++        add       r6, #16         // next strip: +8 samples (16 bytes) in dst
-++        mov       r0, r6
-++        add       r7, #8          // next strip: +8 pixels in src
-++        mov       r1, r7
-++        b         0b              // re-prime the window and re-dispatch on the remaining width
-++4:      pld      [r1]
-++        vextin_d4_8
-++        epel_filter_16b q15
-++        subs r3, #1
-++        epel_filter_32b_4
-++        vst1.16    d6, [r0], r4       // store 4 results
-++        vmov q12, q13
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 4b
-++        subs      r5, #4
-++        beq       99f
-++        mov       r3, r12
-++        add       r6, #8
-++        mov       r0, r6
-++        add       r7, #4
-++        mov       r1, r7
-++        b         0b
-++2:      pld      [r1]
-++        vextin_d4_8
-++        epel_filter_16b q15
-++        subs r3, #1
-++        epel_filter_32b_4
-++        vst1.32    d6[0], [r0], r4    // store 2 results (one 32-bit lane)
-++        vmov q12, q13
-++        vmov q13, q14
-++        vmov q14, q15
-++        bne 2b
-++99:     vpop {d8-d15}
-++        pop {r4-r7}
-++        bx lr
-++endfunc
-++
-++epel_coeffs:                  // 7 rows x 4 tap magnitudes, row index = mx-1 / my-1 (callers scale by 4)
-++       .byte 2, 58, 10, 2     // filters compute tap1 + tap2 - (tap0 + tap3), so each row nets 64 (matches the >> 6)
-++       .byte 4, 54, 16, 2     // NOTE(review): rows are fetched with a word ldr but no alignment directive is visible - confirm
-++       .byte 6, 46, 28, 4
-++       .byte 4, 36, 36, 4
-++       .byte 4, 28, 46, 6
-++       .byte 2, 16, 54, 4
-++       .byte 2, 10, 58, 2
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index 5591807..733ff08 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -58,6 +58,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
-+ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
-+ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
-+ #undef PUT_PIXELS
-++void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-++void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-++void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
-++                                ptrdiff_t srcstride, int height,
-++                                intptr_t mx, intptr_t my, int width);
-+ 
-+ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                    int height, int width);
-+@@ -201,7 +210,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+             c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
-+             c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-+             c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
-++            c->put_hevc_epel[x][1][0]         = ff_hevc_put_epel_v_neon_8;
-++            c->put_hevc_epel[x][0][1]         = ff_hevc_put_epel_h_neon_8;
-++            c->put_hevc_epel[x][1][1]         = ff_hevc_put_epel_hv_neon_8;
-+         }
-++        c->put_hevc_epel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-++        c->put_hevc_epel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-++        c->put_hevc_epel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-++        c->put_hevc_epel[3][0][0]  = ff_hevc_put_pixels_w8_neon_8;
-++        c->put_hevc_epel[4][0][0]  = ff_hevc_put_pixels_w12_neon_8;
-++        c->put_hevc_epel[5][0][0]  = ff_hevc_put_pixels_w16_neon_8;
-++        c->put_hevc_epel[6][0][0]  = ff_hevc_put_pixels_w24_neon_8;
-++        c->put_hevc_epel[7][0][0]  = ff_hevc_put_pixels_w32_neon_8;
-++        c->put_hevc_epel[8][0][0]  = ff_hevc_put_pixels_w48_neon_8;
-++        c->put_hevc_epel[9][0][0]  = ff_hevc_put_pixels_w64_neon_8;
-++
-+         c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
-+         c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
-+         c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
-+-- 
-+2.5.0
-+
-
-From 51c12471695d2d06c671707a7e2e6fec3b01f538 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 62/93] [ffmpeg] Add GPU acceleration to hevc
-
----
- tools/depends/target/ffmpeg/Makefile               |     4 +-
- .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 36672 +++++++++++++++++++
- 2 files changed, 36675 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index 58ec0eb..e4acfa9 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -3,7 +3,8 @@ include FFMPEG-VERSION
- DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_creation.patch \
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-   0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
--  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch
-+  hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch \
-+  pfcd_hevc_optimisations.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -82,6 +83,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- 	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- 	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
- 
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
-diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-new file mode 100644
-index 0000000..f2b8ffc
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
-@@ -0,0 +1,36672 @@
-+From 5a8f38083c6d9afec5029408c8680b2676752035 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 28 Apr 2015 16:18:40 +0100
-+Subject: [PATCH 01/68] Added display output
-+
-+---
-+ ffmpeg.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 159 insertions(+)
-+
-+diff --git a/ffmpeg.c b/ffmpeg.c
-+index ce54374..026ffa9 100644
-+--- a/ffmpeg.c
-++++ b/ffmpeg.c
-+@@ -23,6 +23,11 @@
-+  * multimedia converter based on the FFmpeg libraries
-+  */
-+ 
-++#ifdef RPI
-++#define RPI_DISPLAY
-++//#define RPI_ZERO_COPY
-++#endif
-++
-+ #include "config.h"
-+ #include <ctype.h>
-+ #include <string.h>
-+@@ -69,6 +74,20 @@
-+ # include "libavfilter/buffersrc.h"
-+ # include "libavfilter/buffersink.h"
-+ 
-++#ifdef RPI_DISPLAY
-++#include <bcm_host.h>
-++#include <interface/mmal/mmal.h>
-++#include <interface/mmal/mmal_parameters_camera.h>
-++#include <interface/mmal/mmal_buffer.h>
-++#include <interface/mmal/util/mmal_util.h>
-++#include <interface/mmal/util/mmal_default_components.h>
-++#include <interface/mmal/util/mmal_connection.h>
-++#include <interface/mmal/util/mmal_util_params.h>
-++#ifdef RPI_ZERO_COPY
-++#include "libavcodec/rpi_qpu.h"
-++#endif
-++#endif
-++
-+ #if HAVE_SYS_RESOURCE_H
-+ #include <sys/time.h>
-+ #include <sys/types.h>
-+@@ -161,6 +180,134 @@ static int restore_tty;
-+ static void free_input_threads(void);
-+ #endif
-+ 
-++#ifdef RPI_DISPLAY
-++
-++#define NUM_BUFFERS 4
-++
-++static MMAL_COMPONENT_T* rpi_display = NULL;
-++static MMAL_POOL_T *rpi_pool = NULL;
-++
-++#ifdef RPI_ZERO_COPY
-++static uint8_t *get_vc_handle(AVBufferRef *bref) {  // return the pool entry's vc_handle, cast to a byte pointer
-++  GPU_MEM_PTR_T *gpu_mem = av_buffer_pool_opaque(bref);
-++  return (uint8_t *)gpu_mem->vc_handle;
-++}
-++#endif
-++
-++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h)  // create NUM_BUFFERS buffers for a w x h frame; may return NULL
-++{
-++    MMAL_POOL_T* pool;
-++    size_t i;
-++    size_t size = (w*h*3)/2;  // luma plane (w*h) plus two quarter-size chroma planes
-++#ifdef RPI_ZERO_COPY
-++    mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
-++    pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);  // zero payload: display_frame points each header at GPU memory
-++    assert(pool);
-++#else
-++    pool = mmal_port_pool_create(port, NUM_BUFFERS, size);
-++
-++    for (i = 0; pool && i < NUM_BUFFERS; ++i)  // skip the pre-fill when pool creation failed instead of dereferencing NULL
-++    {
-++       MMAL_BUFFER_HEADER_T* buffer = pool->header[i];
-++       uint8_t* bufPtr = buffer->data;  // byte pointer: arithmetic on void* is a GNU extension
-++       memset(bufPtr, i*30, w*h);  // distinct grey level per buffer
-++       memset(bufPtr+w*h, 128, (w*h)/2);  // chroma bytes set to 128
-++    }
-++#endif
-++
-++    return pool;
-++}
-++
-++static void display_cb_input(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {  // port callback: port is unused
-++  mmal_buffer_header_release(buffer);  // release the header (presumably returning it to its pool - confirm)
-++}
-++
-++static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)  // create the renderer for a w x h frame placed at (x, y)
-++{
-++    MMAL_COMPONENT_T* display = NULL;  // defined value for assert(display) if creation fails without writing it
-++    int w2 = (w+31)&~31;  // width rounded up to a multiple of 32
-++    int h2 = (h+15)&~15;  // height rounded up to a multiple of 16
-++    MMAL_DISPLAYREGION_T region =
-++    {
-++        {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-++        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
-++        .layer = 2,
-++        .fullscreen = 0,
-++        .dest_rect = {x, y, w, h}
-++    };
-++    bcm_host_init();  // TODO is this needed?
-++    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
-++    assert(display);
-++
-++    mmal_port_parameter_set(display->input[0], &region.hdr);
-++
-++    MMAL_ES_FORMAT_T* format = display->input[0]->format;
-++    format->encoding = MMAL_ENCODING_I420;
-++    format->es->video.width = w2;  // buffer geometry uses the padded size ...
-++    format->es->video.height = h2;
-++    format->es->video.crop.x = 0;
-++    format->es->video.crop.y = 0;
-++    format->es->video.crop.width = w;  // ... while the crop rectangle keeps the real picture size
-++    format->es->video.crop.height = h;
-++    mmal_port_format_commit(display->input[0]);
-++
-++    mmal_component_enable(display);
-++
-++    rpi_pool = display_alloc_pool(display->input[0], w2, h2);  // stored in the file-scope pool used by display_frame
-++
-++    mmal_port_enable(display->input[0],display_cb_input);
-++    mmal_port_enable(display->control,display_cb_input);
-++
-++    printf("Allocated display %zu %zu\n",w,h);  // w and h are size_t, so %zu rather than %d
-++
-++    return display;
-++}
-++
-++static void display_frame(MMAL_COMPONENT_T* display,AVFrame* fr)  // hand one decoded frame to the renderer's input port
-++{
-++    int w = fr->width;
-++    int h = fr->height;
-++    int w2 = (w+31)&~31;  // same padding as display_init: width to 32, height to 16
-++    int h2 = (h+15)&~15;
-++    if (!display || !rpi_pool)
-++        return;
-++    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(rpi_pool->queue);
-++    if (!buf) {
-++      // Running too fast so drop the frame
-++      return;
-++    }
-++    assert(buf);  // cannot fire: the NULL case returned just above
-++    buf->cmd = 0;
-++    buf->length = (w2 * h2 * 3)/2;  // padded luma plane plus two quarter-size chroma planes
-++    buf->offset = 0; // Offset to valid data
-++    buf->flags = 0;
-++#ifdef RPI_ZERO_COPY
-++    buf->data = get_vc_handle(fr->buf[0]);  // pass the GPU handle instead of copying pixels
-++    buf->alloc_size = (w2*h2*3)/2;
-++#else
-++    //mmal_buffer_header_mem_lock(buf);
-++    memcpy(buf->data, fr->data[0], w2 * h);  // NOTE(review): one contiguous copy; assumes the luma stride equals w2 - confirm against fr->linesize
-++    memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4);  // first chroma plane placed after the padded luma plane
-++    memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4);  // second chroma plane placed a quarter plane later
-++    //mmal_buffer_header_mem_unlock(buf);
-++#endif
-++
-++    mmal_port_send_buffer(display->input[0], buf);  // I assume this will automatically get released
-++}
-++
-++static void display_exit(MMAL_COMPONENT_T* display)  // tear down the buffer pool, then the renderer component
-++{
-++    if (display && rpi_pool) {  // pool first: it needs display->input[0], which is invalid once the component is destroyed
-++        mmal_port_pool_destroy(display->input[0], rpi_pool);
-++    }
-++    if (display) {  // also covers the case where no frame was ever shown and display is still NULL
-++        mmal_component_destroy(display);
-++    }
-++}
-++
-++#endif
-++
-++
-+ /* sub2video hack:
-+    Convert subtitles to video with alpha to insert them in filter graphs.
-+    This is a temporary solution until libavfilter gets real subtitles support.
-+@@ -582,6 +729,10 @@ static void ffmpeg_cleanup(int ret)
-+     }
-+     term_exit();
-+     ffmpeg_exited = 1;
-++
-++#ifdef RPI_DISPLAY
-++    display_exit(rpi_display);
-++#endif
-+ }
-+ 
-+ void remove_avoptions(AVDictionary **a, AVDictionary *b)
-+@@ -965,6 +1116,14 @@ static void do_video_out(AVFormatContext *s,
-+     int frame_size = 0;
-+     InputStream *ist = NULL;
-+     AVFilterContext *filter = ost->filter->filter;
-++#ifdef RPI_DISPLAY
-++    if (next_picture)
-++    {
-++	if (!rpi_display)
-++           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
-++        display_frame(rpi_display,next_picture);
-++    }
-++#endif
-+ 
-+     if (ost->source_index >= 0)
-+         ist = input_streams[ost->source_index];
-+-- 
-+2.5.0
-+
-+
-+From a72c0e18e722b541d4bb10f1f5c966f95eccbec1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 29 Apr 2015 16:49:43 +0100
-+Subject: [PATCH 02/68] Split transform and intra prediction into commands
-+
-+---
-+ libavcodec/hevc.c       | 119 +++++++++++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/hevc.h       |  58 +++++++++++++++++++++++
-+ libavcodec/hevc_cabac.c |  15 ++++++
-+ 3 files changed, 191 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 5f77761..5566ace 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -918,6 +918,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
-+     return 0;
-+ }
-+ 
-++#ifdef RPI
-++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)  // queue an intra prediction, or run it now when enable_rpi is off
-++{
-++    if (s->enable_rpi) {
-++        HEVCLocalContext *lc = s->HEVClc;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;  // NOTE(review): no bound check against the command array size is visible here - confirm
-++        cmd->type = RPI_PRED_INTRA;
-++        cmd->size = log2_trafo_size;
-++        cmd->c_idx = c_idx;
-++        cmd->x = x0;
-++        cmd->y = y0;
-++        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;  // pack the five neighbour flags: bit4 bottom_left .. bit0 up_right
-++        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;  // chroma mode for c_idx != 0, luma mode otherwise
-++    } else {
-++        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);  // immediate path: same call the non-RPI build makes
-++    }
-++}
-++#endif
-++
-+ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                               int xBase, int yBase, int cb_xBase, int cb_yBase,
-+                               int log2_cb_size, int log2_trafo_size,
-+@@ -930,8 +949,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+     if (lc->cu.pred_mode == MODE_INTRA) {
-+         int trafo_size = 1 << log2_trafo_size;
-+         ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
-+-
-++#ifdef RPI
-++        rpi_intra_pred(s, log2_trafo_size, x0, y0, 0);
-++#else
-+         s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0);
-++#endif
-+     }
-+ 
-+     if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
-+@@ -1017,7 +1039,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1);
-++#endif
-+                 }
-+                 if (cbf_cb[i])
-+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-+@@ -1046,7 +1072,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2);
-++#endif
-+                 }
-+                 if (cbf_cr[i])
-+                     ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
-+@@ -1075,7 +1105,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-+                                                     trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1);
-++#endif
-+                 }
-+                 if (cbf_cb[i])
-+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-+@@ -1085,7 +1119,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                 if (lc->cu.pred_mode == MODE_INTRA) {
-+                     ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                    rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
-++#else
-+                     s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2);
-++#endif
-+                 }
-+                 if (cbf_cr[i])
-+                     ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
-+@@ -1097,26 +1135,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+             int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
-+             int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
-+             ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1);
-++            rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2);
-++#else
-+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1);
-+             s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2);
-++#endif
-+             if (s->ps.sps->chroma_format_idc == 2) {
-+                 ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
-++                rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
-++#else
-+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1);
-+                 s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2);
-++#endif
-+             }
-+         } else if (blk_idx == 3) {
-+             int trafo_size_h = 1 << (log2_trafo_size + 1);
-+             int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]);
-+             ff_hevc_set_neighbour_available(s, xBase, yBase,
-+                                             trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1);
-++            rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2);
-++#else
-+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1);
-+             s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2);
-++#endif
-+             if (s->ps.sps->chroma_format_idc == 2) {
-+                 ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)),
-+                                                 trafo_size_h, trafo_size_v);
-++#ifdef RPI
-++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
-++                rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
-++#else
-+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1);
-+                 s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2);
-++#endif
-+             }
-+         }
-+     }
-+@@ -2291,6 +2349,31 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+     lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
-+ }
-+ 
-++#ifdef RPI
-++static void rpi_execute_pred_cmds(HEVCContext *s)  // replay every queued prediction command in order, then empty the queue
-++{
-++  int i;
-++  HEVCPredCmd *cmd = s->univ_pred_cmds;
-++  HEVCLocalContext *lc = s->HEVClc;
-++
-++  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {  // i only counts down; cmd walks the array forwards
-++      if (cmd->type == RPI_PRED_INTRA) {
-++          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;  // both fields get the one stored mode
-++          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;  // unpack the flags in the bit order rpi_intra_pred packed them
-++          lc->na.cand_left         = (cmd->na >> 3) & 1;
-++          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
-++          lc->na.cand_up           = (cmd->na >> 1) & 1;
-++          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-++          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-++      } else {
-++          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);  // every non-intra command is treated as a transform add
-++      }
-++  }
-++  s->num_pred_cmds = 0;
-++  s->num_coeffs = 0;  // coefficient counter is reset together with the command queue
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -2300,6 +2383,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int y_ctb       = 0;
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-++#ifdef RPI
-++    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-++#endif
-++
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+         return AVERROR_INVALIDDATA;
-+@@ -2329,6 +2416,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-++#ifdef RPI
-++        rpi_execute_pred_cmds(s);
-++#endif
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+             return more_data;
-+@@ -2374,6 +2464,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-+     s = s1->sList[self_id];
-+     lc = s->HEVClc;
-+ 
-++#ifdef RPI
-++    s->enable_rpi = 0;
-++#endif
-++
-+     if(ctb_row) {
-+         ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
-+ 
-+@@ -2998,6 +3092,13 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ 
-+     av_freep(&s->cabac_state);
-+ 
-++#ifdef RPI
-++    av_freep(&s->unif_mv_cmds);
-++    av_freep(&s->unif_xfm_cmds);
-++    av_freep(&s->univ_pred_cmds);
-++    av_freep(&s->coeffs_buf);
-++#endif
-++
-+     for (i = 0; i < 3; i++) {
-+         av_freep(&s->sao_pixel_buffer_h[i]);
-+         av_freep(&s->sao_pixel_buffer_v[i]);
-+@@ -3057,6 +3158,22 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->HEVClcList[0] = s->HEVClc;
-+     s->sList[0] = s;
-+ 
-++#ifdef RPI
-++    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-++    if (!s->unif_mv_cmds)
-++        goto fail;
-++    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
-++    if (!s->unif_xfm_cmds)
-++        goto fail;
-++    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-++    if (!s->univ_pred_cmds)
-++        goto fail;
-++    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
-++    if (!s->coeffs_buf)
-++        goto fail;
-++    s->enable_rpi = 0;
-++#endif
-++
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+     if (!s->cabac_state)
-+         goto fail;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index d84e661..aa66b00 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -23,6 +23,9 @@
-+ #ifndef AVCODEC_HEVC_H
-+ #define AVCODEC_HEVC_H
-+ 
-++// define RPI to split the CABAC/prediction/transform into separate stages
-++#include "config.h"
-++
-+ #include "libavutil/buffer.h"
-+ #include "libavutil/md5.h"
-+ 
-+@@ -816,6 +819,49 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-++#ifdef RPI
-++
-++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-++#define RPI_MAX_WIDTH 2048
-++
-++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
-++#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-++#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-++// Each block can have an intra prediction and a transform_add command
-++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-++
-++// Command for inter prediction
-++typedef struct HEVCMvCmd {
-++} HEVCMvCmd;
-++
-++// Command for transform to process a block of coefficients
-++typedef struct HEVCXfmCmd {
-++} HEVCXfmCmd;
-++
-++// Command for intra prediction and transform_add of predictions to coefficients
-++// <type> selects which member of each anonymous union below is meaningful.
-++#define RPI_PRED_TRANSFORM_ADD 0
-++#define RPI_PRED_INTRA 1
-++typedef struct HEVCPredCmd {
-++    // log2 of the block size; consumers index their function tables with size-2
-++    uint8_t size;
-++    // RPI_PRED_TRANSFORM_ADD or RPI_PRED_INTRA
-++    uint8_t type;
-++    // RPI_PRED_INTRA: packed neighbour availability flags
-++    // (bit 4 bottom_left, bit 3 left, bit 2 up_left, bit 1 up, bit 0 up_right)
-++    uint8_t na;
-++    // RPI_PRED_INTRA: colour component index handed to intra_pred
-++    uint8_t c_idx;
-++    union {
-++        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
-++        uint32_t x;   // RPI_PRED_INTRA
-++    };
-++    union {
-++        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
-++        uint32_t y;   // RPI_PRED_INTRA
-++    };
-++    union {
-++        enum IntraPredMode mode; // RPI_PRED_INTRA
-++        uint32_t stride;         // RPI_PRED_TRANSFORM_ADD
-++    };
-++} HEVCPredCmd;
-++
-++#endif
-++
-+ typedef struct HEVCContext {
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+@@ -831,6 +877,18 @@ typedef struct HEVCContext {
-+     int                 width;
-+     int                 height;
-+ 
-++#ifdef RPI
-++    int enable_rpi;
-++    HEVCMvCmd *unif_mv_cmds;
-++    HEVCXfmCmd *unif_xfm_cmds;
-++    HEVCPredCmd *univ_pred_cmds;
-++    int16_t *coeffs_buf;
-++    int num_mv_cmds;
-++    int num_xfm_cmds;
-++    int num_pred_cmds;
-++    int num_coeffs;
-++#endif
-++
-+     uint8_t *cabac_state;
-+ 
-+     /** 1 if the independent slice segment header was successfully parsed */
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index d1bef83..c0fdfad 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1510,6 +1510,21 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+         }
-+     }
-++#ifdef RPI
-++    if (s->enable_rpi) {
-++        int16_t *c = s->coeffs_buf + s->num_coeffs;
-++        int n = trafo_size * trafo_size;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
-++        s->num_coeffs += n;
-++        cmd->type = RPI_PRED_TRANSFORM_ADD;
-++        cmd->size = log2_trafo_size;
-++        cmd->buf = c;
-++        cmd->dst = dst;
-++        cmd->stride = stride;
-++        return;
-++    }
-++#endif
-+     s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
-+ }
-+ 
-+-- 
-+2.5.0
-+
-+
-+From f4cf5194f103463ebd84eb36f571be06ca2aa49d Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 30 Apr 2015 15:23:22 +0100
-+Subject: [PATCH 03/68] Added simple VPU test code
-+
-+---
-+ libavcodec/Makefile             |    7 +
-+ libavcodec/hevc.c               |   33 +-
-+ libavcodec/rpi_hevc_transform.h |  212 ++++++
-+ libavcodec/rpi_hevc_transform.s |  147 ++++
-+ libavcodec/rpi_mailbox.c        |  293 ++++++++
-+ libavcodec/rpi_mailbox.h        |   20 +
-+ libavcodec/rpi_qpu.c            |  652 ++++++++++++++++++
-+ libavcodec/rpi_qpu.h            |   45 ++
-+ libavcodec/rpi_shader.c         |  818 ++++++++++++++++++++++
-+ libavcodec/rpi_shader.h         |   20 +
-+ libavcodec/rpi_shader.qasm      | 1413 +++++++++++++++++++++++++++++++++++++++
-+ libavcodec/rpi_user_vcsm.h      |  425 ++++++++++++
-+ 12 files changed, 4084 insertions(+), 1 deletion(-)
-+ create mode 100644 libavcodec/rpi_hevc_transform.h
-+ create mode 100644 libavcodec/rpi_hevc_transform.s
-+ create mode 100644 libavcodec/rpi_mailbox.c
-+ create mode 100644 libavcodec/rpi_mailbox.h
-+ create mode 100644 libavcodec/rpi_qpu.c
-+ create mode 100644 libavcodec/rpi_qpu.h
-+ create mode 100644 libavcodec/rpi_shader.c
-+ create mode 100644 libavcodec/rpi_shader.h
-+ create mode 100644 libavcodec/rpi_shader.qasm
-+ create mode 100644 libavcodec/rpi_user_vcsm.h
-+
-+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-+index 5088304..54e14b4 100644
-+--- a/libavcodec/Makefile
-++++ b/libavcodec/Makefile
-+@@ -4,6 +4,10 @@ NAME = avcodec
-+ 
-+ HEADERS = avcodec.h                                                     \
-+           avfft.h                                                       \
-++          rpi_qpu.h                                                     \
-++          rpi_shader.h                                                  \
-++          rpi_mailbox.h                                                 \
-++          rpi_hevc_transform.h                                          \
-+           dv_profile.h                                                  \
-+           d3d11va.h                                                     \
-+           dxva2.h                                                       \
-+@@ -35,6 +39,9 @@ OBJS = allcodecs.o                                                      \
-+        resample.o                                                       \
-+        resample2.o                                                      \
-+        utils.o                                                          \
-++       rpi_qpu.o                                                        \
-++       rpi_shader.o                                                     \
-++       rpi_mailbox.o                                                    \
-+        vorbis_parser.o                                                  \
-+        xiph.o                                                           \
-+ 
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 5566ace..e58a3d0 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -39,6 +39,10 @@
-+ #include "golomb.h"
-+ #include "hevc.h"
-+ 
-++#ifdef RPI
-++#include "rpi_qpu.h"
-++#endif
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ /**
-+@@ -2417,7 +2421,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        rpi_execute_pred_cmds(s);
-++        if (x_ctb + ctb_size >= s->ps.sps->width) {
-++            rpi_execute_pred_cmds(s);
-++        }
-+ #endif
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+@@ -3172,6 +3178,31 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->coeffs_buf)
-+         goto fail;
-+     s->enable_rpi = 0;
-++
-++    // A little test program
-++    {
-++      GPU_MEM_PTR_T p;
-++      int err = gpu_malloc_cached(16, &p);
-++      short *q = (short *)p.arm;
-++      int i;
-++      int r;
-++      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
-++      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
-++      printf("Preparing data %p\n",q);
-++      for(i=0;i<16;i++)
-++        q[i] = i;
-++      printf("Flush cache\n");
-++      gpu_cache_flush(&p);
-++      printf("Executing code\n");
-++      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
-++      printf("Return value %d (",r);
-++      for(i=0;i<16;i++)
-++        printf("%d ",q[i]);
-++      printf(")\n");
-++      gpu_free(&p);
-++      goto fail; // Early out
-++    }
-++
-+ #endif
-+ 
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+new file mode 100644
-+index 0000000..85a9102
-+--- /dev/null
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -0,0 +1,212 @@
-++unsigned char rpi_hevc_transform [] = {
-++169,
-++3,
-++3,
-++232,
-++128,
-++0,
-++0,
-++0,
-++20,
-++248,
-++0,
-++136,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++0,
-++96,
-++3,
-++232,
-++32,
-++0,
-++0,
-++0,
-++7,
-++232,
-++0,
-++2,
-++0,
-++0,
-++8,
-++232,
-++0,
-++4,
-++0,
-++0,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++5,
-++232,
-++0,
-++0,
-++8,
-++0,
-++128,
-++69,
-++113,
-++66,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++128,
-++69,
-++113,
-++70,
-++128,
-++144,
-++39,
-++0,
-++4,
-++255,
-++48,
-++192,
-++128,
-++3,
-++32,
-++8,
-++16,
-++0,
-++76,
-++254,
-++48,
-++192,
-++9,
-++4,
-++32,
-++8,
-++0,
-++0,
-++4,
-++254,
-++0,
-++144,
-++128,
-++2,
-++0,
-++248,
-++62,
-++0,
-++128,
-++144,
-++22,
-++0,
-++4,
-++255,
-++48,
-++192,
-++128,
-++3,
-++32,
-++8,
-++16,
-++0,
-++76,
-++254,
-++48,
-++192,
-++9,
-++4,
-++32,
-++8,
-++0,
-++0,
-++140,
-++248,
-++44,
-++0,
-++0,
-++0,
-++32,
-++48,
-++4,
-++0,
-++128,
-++69,
-++113,
-++66,
-++242,
-++140,
-++211,
-++192,
-++41,
-++3,
-++68,
-++192,
-++80,
-++7,
-++164,
-++255,
-++36,
-++220,
-++96,
-++2,
-++0,
-++248,
-++62,
-++0,
-++3,
-++255,
-++55,
-++208,
-++120,
-++3,
-++224,
-++3,
-++190,
-++11,
-++16,
-++139,
-++246,
-++83,
-++0,
-++103,
-++90,
-++0,
-++8,
-++240,
-++0,
-++128,
-++128,
-++3,
-++0,
-++247,
-++32,
-++128,
-++10,
-++4,
-++136,
-++240,
-++32,
-++0,
-++128,
-++3,
-++112,
-++96,
-++90,
-++0,
-++};
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+new file mode 100644
-+index 0000000..5e2728d
-+--- /dev/null
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -0,0 +1,147 @@
-++# ******************************************************************************
-++# Argon Design Ltd.
-++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-++#
-++# Module : HEVC
-++# Author : Peter de Rivaz
-++# ******************************************************************************
-++
-++# HEVC VPU Transform
-++#
-++# Transform matrix can be thought of as
-++#   output row vector = input row vector * transMatrix2
-++#
-++# The even rows of the matrix are symmetric
-++# The odd rows of the matrix are antisymmetric
-++#
-++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-++#
-++# EXAMPLE
-++#   (a b c d) (1 2  2  1)
-++#             (3 4 -4 -3)
-++#             (5 6  6  5)
-++#             (7 8 -8 -7)
-++#
-++#  x=(a c)(1 2) = 1a+5c 2a+6c
-++#         (5 6)
-++#
-++#  y=(b d)(3 4) = 3b+7d 4b+8d
-++#         (7 8)
-++#
-++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-++#
-++#  Final results are (u , v[::-1])
-++#
-++#
-++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-++#  Apply the even matrix first and stop before rounding
-++#  Then apply the odd matrix in a full manner:
-++#
-++#   First step is to compute partial products with the first input (16 cycles)
-++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
-++#   2a 4b 6c 8d
-++#   2a -4b 6c -8d
-++#   1a -3b 5c -7d
-++#
-++#   Second step is to sum partial products into final position (8 cycles)
-++#   1a+3b+5c+7d
-++#   2a+4b+6c+8d
-++#   2a-4b+6c-8d
-++#   1a-3b+5c-7d
-++#
-++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-++#
-++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-++#
-++#   For 8x8 we could compute two in parallel.
-++#
-++#
-++
-++# test_add: minimal smoke test for running code on the VPU.
-++# Loads a vector from the address held in r0, adds 10 to it, stores it back
-++# to the same address and returns 7 in r0.
-++# NOTE(review): presumably operates on 16 halfwords, matching the 16 shorts
-++# the ARM-side test prepares - confirm against the VPU instruction reference.
-++test_add:
-++  vldh HX(0,0),(r0)
-++  vadd HX(0,0),HX(0,0),10
-++  vsth HX(0,0),(r0)
-++  mov r0,7 # return value
-++  b lr
-++
-++# Columns are transformed first
-++#
-++# Store top left half of transMatrix2 in HX(32,0)
-++# Store bottom left half of transMatrix2 in HX(32,32)
-++#
-++# For 16x16
-++# HX(0:15,0) contains input data before transform
-++# HY(0:15,0) contains 32bit output data after transform
-++# HX(32,0) contains even rows of left half of transMatrix2
-++# HX(32,32) contains odd rows of left half of transMatrix2
-++# HY(48,0) contains partial products ready for summing
-++#
-++
-++
-++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-++# num: number of 16x16 transforms to be done
-++#
-++# The arguments arrive in r0, r1 and r2 respectively: r0 is the load address
-++# for the matrix, r1 the load/store address for the coefficients and r2 the
-++# counter decremented by the loop at the bottom.
-++# Each block is transformed in place: it is loaded from (r1), passed through
-++# col_trans_16 twice, and stored back to (r1).
-++#
-++hevc_trans_16x16:
-++  push r6-r15, lr # TODO cut down number of used registers
-++
-++  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
-++  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++  # Now use r0 to describe which matrix we are working on.
-++  # Allows us to prefetch the next block of coefficients for efficiency.
-++  mov r0,0 # This describes the location where we read our coefficients from
-++  mov r3,16*2 # Stride of coefficients in bytes
-++  mov r7,16*16*2 # Total block size
-++  mov r8,64*16 # Value used to swap from current to next VRF location
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  # NOTE(review): col_trans_16 begins with "add r4,r0,16", so r4 no longer
-++  # holds 64 when the vadd after each "bl col_trans_16" reads it - confirm.
-++  mov r4,64 # Constant used for rounding first pass
-++  # NOTE(review): r5 is not referenced again in this routine; the second pass
-++  # below also adds r4 - confirm which constant was intended.
-++  mov r5,1<<19 # Constant used for rounding second pass
-++
-++  # At start of block r0,r1 point to the current block (that has already been loaded)
-++block_loop:
-++  # Toggle r0 to the other VRF location and step r1 forward one block so the
-++  # load below targets the next block; both are restored straight afterwards.
-++  eor r0,r8
-++  add r1,r7
-++  # Prefetch the next block
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  eor r0,r8
-++  sub r1,r7
-++
-++  # Transform the current block
-++  bl col_trans_16
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-++
-++  bl col_trans_16
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++
-++  # Save results - note there has been a transposition during the processing so we save columns
-++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
-++
-++  # Move onto next block
-++  eor r0,r8
-++  add r1,r7
-++
-++  # Decrement the block counter in r2 and loop while it is still above zero
-++  addcmpbgt r2,-1,0,block_loop
-++  pop r6-r15, pc
-++
-++# col_trans_16: apply the 16x16 matrix to each of the 16 columns of a block.
-++# r1,r2,r3 r7,r8 should be preserved
-++# HX(0++,0)+r0 is the block to be transformed
-++# HX(32++,0) is the 16x16 matrix of transform coefficients
-++# Use HY(48,0) for intermediate results
-++# r0 can be used, but should be returned to its original value at the end
-++# NOTE(review): r4 is overwritten below as the loop bound but is not in the
-++# preserved list, while hevc_trans_16x16 loads r4 with its rounding constant
-++# before calling this routine and reads it again afterwards - confirm.
-++col_trans_16:
-++  add r4,r0,16 # Final value for this loop
-++col_trans_16_loop:
-++  # First compute partial products for a single column
-++  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-++  # Then sum up the results and place back
-++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-++  addcmpblt r0,1,r4,col_trans_16_loop
-++  sub r0,16  # put r0 back to its original value
-++  b lr
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+new file mode 100644
-+index 0000000..536896f
-+--- /dev/null
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -0,0 +1,293 @@
-++/*
-++Copyright (c) 2012, Broadcom Europe Ltd.
-++All rights reserved.
-++
-++Redistribution and use in source and binary forms, with or without
-++modification, are permitted provided that the following conditions are met:
-++    * Redistributions of source code must retain the above copyright
-++      notice, this list of conditions and the following disclaimer.
-++    * Redistributions in binary form must reproduce the above copyright
-++      notice, this list of conditions and the following disclaimer in the
-++      documentation and/or other materials provided with the distribution.
-++    * Neither the name of the copyright holder nor the
-++      names of its contributors may be used to endorse or promote products
-++      derived from this software without specific prior written permission.
-++
-++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-++*/
-++
-++#include <stdio.h>
-++#include <string.h>
-++#include <stdlib.h>
-++#include <fcntl.h>
-++#include <unistd.h>
-++#include <assert.h>
-++#include <stdint.h>
-++#include <sys/mman.h>
-++#include <sys/ioctl.h>
-++
-++#include <linux/ioctl.h>
-++
-++#define MAJOR_NUM 100
-++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-++#define DEVICE_FILE_NAME "/dev/char_dev"
-++
-++#include "rpi_mailbox.h"
-++
-++#define PAGE_SIZE (4*1024)
-++
-++// Shared memory will not be cached in ARM cache
-++//
-++// Map <size> bytes of physical memory starting at <base> through /dev/mem
-++// with MAP_SHARED.  <base> does not have to be page aligned: the mapping
-++// starts on the enclosing PAGE_SIZE boundary and the returned pointer is
-++// advanced by the remainder.
-++// Returns NULL if /dev/mem cannot be opened or the mapping fails.
-++void *mapmem_shared(unsigned base, unsigned size)
-++{
-++   int mem_fd;
-++   void *mem;
-++   unsigned offset = base % PAGE_SIZE;
-++   base = base - offset;
-++   /* open /dev/mem */
-++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-++      return NULL;
-++   }
-++   // Map size+offset bytes so that the whole requested range is covered even
-++   // when the caller's base was not page aligned.
-++   mem = mmap(
-++      0,
-++      size + offset,
-++      PROT_READ|PROT_WRITE,
-++      MAP_SHARED/*|MAP_FIXED*/,
-++      mem_fd,
-++      base);
-++   // A mapping stays valid after its descriptor is closed, so release the
-++   // descriptor now; this also stops it leaking on the failure path below.
-++   close(mem_fd);
-++#ifdef DEBUG
-++   printf("base=0x%x, mem=%p\n", base, mem);
-++#endif
-++   if (mem == MAP_FAILED) {
-++      printf("mmap error %d\n", (int)(intptr_t)mem);
-++      return NULL;
-++   }
-++   return (char *)mem + offset;
-++}
-++
-++// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
-++//
-++// Map <size> bytes of physical memory starting at <base> through /dev/mem
-++// with MAP_PRIVATE.  <base> does not have to be page aligned: the mapping
-++// starts on the enclosing PAGE_SIZE boundary and the returned pointer is
-++// advanced by the remainder.
-++// Returns NULL if /dev/mem cannot be opened or the mapping fails.
-++void *mapmem_private(unsigned base, unsigned size)
-++{
-++   int mem_fd;
-++   void *mem;
-++   unsigned offset = base % PAGE_SIZE;
-++   base = base - offset;
-++   /* open /dev/mem */
-++   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
-++      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
-++      return NULL;
-++   }
-++   // Map size+offset bytes so that the whole requested range is covered even
-++   // when the caller's base was not page aligned.
-++   mem = mmap(
-++      0,
-++      size + offset,
-++      PROT_READ|PROT_WRITE,
-++      MAP_PRIVATE/*|MAP_FIXED*/,
-++      mem_fd,
-++      base);
-++   // A mapping stays valid after its descriptor is closed, so release the
-++   // descriptor now; this also stops it leaking on the failure path below.
-++   close(mem_fd);
-++#ifdef DEBUG
-++   printf("base=0x%x, mem=%p\n", base, mem);
-++#endif
-++   if (mem == MAP_FAILED) {
-++      printf("mmap error %d\n", (int)(intptr_t)mem);
-++      return NULL;
-++   }
-++   return (char *)mem + offset;
-++}
-++
-++// Remove the mapping of <size> bytes at <addr>.
-++// A munmap failure is reported on stdout and terminates the process.
-++void unmapmem(void *addr, unsigned size)
-++{
-++   const int rc = munmap(addr, size);
-++   if (rc == 0)
-++      return;
-++   printf("munmap error %d\n", rc);
-++   exit (-1);
-++}
-++
-++/*
-++ * Hand the property message in <buf> to the mailbox driver with
-++ * IOCTL_MBOX_PROPERTY.  Returns the ioctl result; a negative result is also
-++ * reported on stdout.  With DEBUG defined the buffer is dumped word by word,
-++ * its first word being taken as the length in bytes.
-++ */
-++
-++static int mbox_property(int file_desc, void *buf)
-++{
-++   const int status = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-++
-++   if (status < 0)
-++      printf("ioctl_set_msg failed:%d\n", status);
-++
-++#ifdef DEBUG
-++   {
-++      unsigned *words = buf;
-++      unsigned total = words[0];
-++      int k;
-++      for (k = 0; k < total/4; k++)
-++         printf("%04x: 0x%08x\n", k*sizeof *words, words[k]);
-++   }
-++#endif
-++   return status;
-++}
-++
-++// Ask the firmware (property tag 0x3000c) for a buffer with the given size,
-++// alignment and flags.
-++// Returns the word the firmware writes back in the first value slot (used by
-++// callers as the buffer handle), or 0 if the mailbox request itself failed.
-++unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
-++{
-++   int i=0;
-++   unsigned p[32];
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++
-++   p[i++] = 0x3000c; // (the tag id)
-++   p[i++] = 12; // (size of the buffer)
-++   p[i++] = 12; // (size of the data)
-++   p[i++] = size; // (num bytes? or pages?)
-++   p[i++] = align; // (alignment)
-++   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   // If the ioctl failed p[5] still holds the requested size, which callers
-++   // would mistake for a valid handle; report the failure as "no handle".
-++   if (mbox_property(file_desc, p) < 0)
-++      return 0;
-++   return p[5];
-++}
-++
-++// Send property tag 0x3000f for <handle> and return the word found in the
-++// first value slot of the message afterwards.
-++unsigned mem_free(int file_desc, unsigned handle)
-++{
-++   enum { MSG_WORDS = 7 };
-++   unsigned msg[32];
-++
-++   msg[0] = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1] = 0x00000000;            // process request
-++   msg[2] = 0x3000f;               // tag id
-++   msg[3] = 4;                     // size of the buffer
-++   msg[4] = 4;                     // size of the data
-++   msg[5] = handle;
-++   msg[6] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Send property tag 0x3000d for <handle> and return the word found in the
-++// first value slot of the message afterwards (callers use it as the buffer's
-++// VideoCore address).
-++unsigned mem_lock(int file_desc, unsigned handle)
-++{
-++   enum { MSG_WORDS = 7 };
-++   unsigned msg[32];
-++
-++   msg[0] = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1] = 0x00000000;            // process request
-++   msg[2] = 0x3000d;               // tag id
-++   msg[3] = 4;                     // size of the buffer
-++   msg[4] = 4;                     // size of the data
-++   msg[5] = handle;
-++   msg[6] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Send property tag 0x3000e for <handle> and return the word found in the
-++// first value slot of the message afterwards.
-++unsigned mem_unlock(int file_desc, unsigned handle)
-++{
-++   enum { MSG_WORDS = 7 };
-++   unsigned msg[32];
-++
-++   msg[0] = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1] = 0x00000000;            // process request
-++   msg[2] = 0x3000e;               // tag id
-++   msg[3] = 4;                     // size of the buffer
-++   msg[4] = 4;                     // size of the data
-++   msg[5] = handle;
-++   msg[6] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Send property tag 0x30010 carrying a code address and six register values
-++// (r0..r5), and return the word found in the first value slot afterwards.
-++unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-++{
-++   enum { MSG_WORDS = 13 };
-++   unsigned msg[32];
-++
-++   msg[0]  = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1]  = 0x00000000;            // process request
-++   msg[2]  = 0x30010;               // tag id
-++   msg[3]  = 28;                    // size of the buffer
-++   msg[4]  = 28;                    // size of the data
-++   msg[5]  = code;
-++   msg[6]  = r0;
-++   msg[7]  = r1;
-++   msg[8]  = r2;
-++   msg[9]  = r3;
-++   msg[10] = r4;
-++   msg[11] = r5;
-++   msg[12] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Send property tag 0x30012 with the <enable> flag and return the word found
-++// in the first value slot afterwards (callers treat non-zero as failure).
-++unsigned qpu_enable(int file_desc, unsigned enable)
-++{
-++   enum { MSG_WORDS = 7 };
-++   unsigned msg[32];
-++
-++   msg[0] = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1] = 0x00000000;            // process request
-++   msg[2] = 0x30012;               // tag id
-++   msg[3] = 4;                     // size of the buffer
-++   msg[4] = 4;                     // size of the data
-++   msg[5] = enable;
-++   msg[6] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Send property tag 0x30011 carrying the QPU count, control word, noflush
-++// flag and timeout (ms), and return the word found in the first value slot
-++// afterwards.
-++unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
-++   enum { MSG_WORDS = 10 };
-++   unsigned msg[32];
-++
-++   msg[0] = MSG_WORDS*sizeof *msg; // total message size in bytes
-++   msg[1] = 0x00000000;            // process request
-++   msg[2] = 0x30011;               // tag id
-++   msg[3] = 16;                    // size of the buffer
-++   msg[4] = 16;                    // size of the data
-++   msg[5] = num_qpus;
-++   msg[6] = control;
-++   msg[7] = noflush;
-++   msg[8] = timeout;               // ms
-++   msg[9] = 0x00000000;            // end tag
-++
-++   mbox_property(file_desc, msg);
-++   return msg[5];
-++}
-++
-++// Open the character device used to talk to the kernel mailbox driver.
-++// Returns the descriptor, or the negative value from open() after printing
-++// a hint on how to create the device node.
-++int mbox_open() {
-++   const int fd = open(DEVICE_FILE_NAME, 0);
-++
-++   if (fd >= 0)
-++      return fd;
-++
-++   printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
-++   printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
-++   return fd;
-++}
-++
-++// Close a descriptor previously returned by mbox_open().
-++void mbox_close(int file_desc) {
-++  close(file_desc);
-++}
-+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-+new file mode 100644
-+index 0000000..c264d2e
-+--- /dev/null
-++++ b/libavcodec/rpi_mailbox.h
-+@@ -0,0 +1,20 @@
-++#ifndef RPI_MAILBOX_H
-++#define RPI_MAILBOX_H
-++
-++// Open / close the mailbox character device.
-++extern int mbox_open(void);
-++extern void mbox_close(int file_desc);
-++
-++// NOTE(review): rpi_mailbox.c as added alongside this header contains no
-++// definition of get_version - confirm it is provided elsewhere or drop it.
-++extern unsigned get_version(int file_desc);
-++// Firmware buffer management through mailbox property messages.
-++extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
-++extern unsigned mem_free(int file_desc, unsigned handle);
-++extern unsigned mem_lock(int file_desc, unsigned handle);
-++extern unsigned mem_unlock(int file_desc, unsigned handle);
-++// Map / unmap physical memory through /dev/mem (both map calls return NULL on failure).
-++extern void *mapmem_shared(unsigned base, unsigned size);
-++extern void *mapmem_private(unsigned base, unsigned size);
-++extern void unmapmem(void *addr, unsigned size);
-++
-++// Code execution and QPU enable requests, also sent as property messages.
-++extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-++extern unsigned qpu_enable(int file_desc, unsigned enable);
-++
-++#endif
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+new file mode 100644
-+index 0000000..b1f50ee
-+--- /dev/null
-++++ b/libavcodec/rpi_qpu.c
-+@@ -0,0 +1,652 @@
-++#ifdef RPI
-++// Use the vcsm device for shared memory
-++// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-++#define RPI_USE_VCSM
-++#define RPI_TIME_TOTAL_QPU
-++
-++#include <stdio.h>
-++#include <stdlib.h>
-++#include <string.h>
-++#include <stddef.h>
-++#include <assert.h>
-++
-++#include "config.h"
-++
-++#include <pthread.h>
-++#include <time.h>
-++
-++#include "rpi_mailbox.h"
-++#include "rpi_qpu.h"
-++#include "rpi_shader.h"
-++#include "rpi_hevc_transform.h"
-++
-++#ifdef RPI_USE_VCSM
-++#include "rpi_user_vcsm.h"
-++#endif
-++
-++// On Pi2 there is no way to access the VPU L2 cache
-++// GPU_MEM_FLG should be 4 for uncached memory.
-++// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-++// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-++#define GPU_MEM_FLG 0xC
-++#define GPU_MEM_MAP 0x0
-++
-++#define vcos_verify(x) ((x)>=0)
-++
-++typedef unsigned char uint8_t;
-++typedef signed char int8_t;
-++typedef unsigned short uint16_t;
-++typedef unsigned int uint32_t;
-++typedef int int32_t;
-++
-++/*static const unsigned code[] =
-++{
-++  #include "rpi_shader.hex"
-++};*/
-++
-++// Size in 32bit words
-++#define QPU_CODE_SIZE 2048
-++#define VPU_CODE_SIZE 2048
-++
-++// Bookkeeping block shared with the GPU.  gpu_init() allocates one instance
-++// with mem_alloc(), maps it into the ARM address space and copies the QPU
-++// shader code and the VPU transform code into the two arrays at its start.
-++struct GPU
-++{
-++  unsigned int qpu_code[QPU_CODE_SIZE];
-++  unsigned int vpu_code[VPU_CODE_SIZE];
-++  int open_count; // Number of allocated video buffers
-++  unsigned int vc_handle; // Handle of this memory
-++  int      mb; // Mailbox handle
-++  int      vc; // Address in GPU memory
-++  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-++};
-++
-++// Stop more than one thread trying to allocate memory or use the processing resources at once
-++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-++static volatile struct GPU* gpu = NULL;
-++
-++#ifdef RPI_TIME_TOTAL_QPU
-++// Microseconds elapsed since the first call, taken from CLOCK_REALTIME and
-++// wrapping modulo 2^32.  The first call records the base and returns 0.
-++static unsigned int Microseconds(void) {
-++    struct timespec ts;
-++    unsigned int x;
-++    static unsigned int base = 0;
-++    clock_gettime(CLOCK_REALTIME, &ts);
-++    // Multiply in unsigned arithmetic: with a 32 bit time_t the product
-++    // tv_sec*1000000 overflows a signed type, which is undefined behaviour,
-++    // whereas unsigned wrap-around is exactly what the subtraction below needs.
-++    x = (unsigned int)ts.tv_sec*1000000u + (unsigned int)(ts.tv_nsec/1000);
-++    if (base==0) base=x;
-++    return x-base;
-++}
-++#endif
-++
-++// Connect to QPU, returns 0 on success.
-++// Opens the mailbox, enables the QPUs, allocates and maps one struct GPU in
-++// GPU memory, stores its address in *gpu and copies the QPU and VPU code
-++// into it.
-++// On failure returns -1 (mailbox open), -2 (QPU enable), -3 (allocation) or
-++// -4 (mapping) after releasing whatever had been acquired up to that point.
-++static int gpu_init(volatile struct GPU **gpu) {
-++  int mb = mbox_open();
-++  int vc;
-++  int handle;
-++  volatile struct GPU* ptr;
-++  if (mb < 0)
-++    return -1;
-++
-++  if (qpu_enable(mb, 1)) {
-++    mbox_close(mb); // do not leak the mailbox descriptor
-++    return -2;
-++  }
-++
-++#ifdef RPI_USE_VCSM
-++  vcsm_init();
-++#endif
-++
-++  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
-++  if (!handle)
-++  {
-++    qpu_enable(mb, 0);
-++    mbox_close(mb);
-++    return -3;
-++  }
-++  vc = mem_lock(mb, handle);
-++  ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
-++  if (ptr == NULL)
-++  {
-++    // Unlock before freeing: the handle must not be used once it is released
-++    mem_unlock(mb, handle);
-++    mem_free(mb, handle);
-++    qpu_enable(mb, 0);
-++    mbox_close(mb);
-++    return -4;
-++  }
-++
-++  ptr->mb = mb;
-++  ptr->vc_handle = handle;
-++  ptr->vc = vc;
-++
-++  *gpu = ptr;
-++
-++  // Now copy over the QPU code into GPU memory
-++  {
-++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-++    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-++  }
-++  // And the VPU code
-++  {
-++    int num_bytes = sizeof(rpi_hevc_transform);
-++    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-++  }
-++
-++  return 0;
-++}
-++
-++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-++static void gpu_lock(void) {
-++  pthread_mutex_lock(&gpu_mutex);
-++  if (gpu==NULL) {
-++    gpu_init(&gpu);
-++  }
-++}
-++
-++static void gpu_unlock(void) {
-++  pthread_mutex_unlock(&gpu_mutex);
-++}
-++
-++// Allocate memory on GPU
-++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-++// Returns 0 on success.
-++// This allocates memory that will not be cached in ARM's data cache.
-++// Therefore safe to use without data cache flushing.
-++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-++  gpu_lock();
-++  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-++  p->vcsm_handle = 0;
-++  if (!p->vc_handle)
-++  {
-++    qpu_enable(gpu->mb, 0);
-++    return -3;
-++  }
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-++  p->numbytes = numbytes;
-++  if (p->arm == NULL)
-++  {
-++    mem_free(gpu->mb, p->vc_handle);
-++    mem_unlock(gpu->mb, p->vc_handle);
-++    gpu_unlock();
-++    qpu_enable(gpu->mb, 0);
-++    return -4;
-++  }
-++  gpu->open_count++;
-++  gpu_unlock();
-++  return 0;
-++}
-++
-++void gpu_cache_flush(GPU_MEM_PTR_T *p)
-++{
-++  // This only works when using RPI_USE_VCSM
-++  void *tmp = vcsm_lock(p->vcsm_handle);
-++  vcsm_unlock_ptr(tmp);
-++}
-++
-++// This allocates data that will be
-++//    Cached in ARM L2
-++//    Uncached in VPU L2
-++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-++  gpu_lock();
-++#ifdef RPI_USE_VCSM
-++  {
-++      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
-++      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
-++      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++      p->arm = vcsm_lock(p->vcsm_handle);
-++      p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  }
-++#else
-++  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-++  p->vcsm_handle = 0;
-++  if (!p->handle)
-++  {
-++    qpu_enable(gpu->mb, 0);
-++    return -3;
-++  }
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  printf("This mapmem_private does not seem to work\n");
-++  exit(-1);
-++  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-++  p->numbytes = numbytes;
-++  if (p->arm == NULL)
-++  {
-++    mem_free(gpu->mb, p->handle);
-++    mem_unlock(gpu->mb, p->handle);
-++    gpu_unlock();
-++    qpu_enable(gpu->mb, 0);
-++    return -4;
-++  }
-++#endif
-++  gpu->open_count++;
-++  gpu_unlock();
-++  return 0;
-++}
-++
-++static void gpu_term(void)
-++{
-++	int mb = gpu->mb;
-++	unsigned handle = gpu->vc_handle;
-++  if (gpu==NULL)
-++    return;
-++	unmapmem((void*)gpu, sizeof(struct GPU));
-++	mem_unlock(mb, handle);
-++	mem_free(mb, handle);
-++	qpu_enable(mb, 0);
-++#ifdef RPI_USE_VCSM
-++  vcsm_exit();
-++#endif
-++	mbox_close(mb);
-++  gpu = NULL;
-++}
-++
-++void gpu_free(GPU_MEM_PTR_T *p) {
-++  int mb = gpu->mb;
-++	unsigned handle = p->vc_handle;
-++  gpu_lock();
-++#ifdef RPI_USE_VCSM
-++  if (p->vcsm_handle) {
-++      mem_unlock(mb,p->vc_handle);
-++      vcsm_unlock_ptr(p->arm);
-++      vcsm_free(p->vcsm_handle);
-++  } else {
-++	unmapmem((void*)p->arm, sizeof(struct GPU));
-++      mem_unlock(mb, handle);
-++      mem_free(mb, handle);
-++  }
-++#else
-++	unmapmem((void*)p->arm, sizeof(struct GPU));
-++	mem_unlock(mb, handle);
-++	mem_free(mb, handle);
-++#endif
-++
-++  gpu->open_count--;
-++  if (gpu->open_count==0) {
-++      printf("Closing GPU\n");
-++      gpu_term();
-++      gpu = NULL;
-++  }
-++  gpu_unlock();
-++}
-++
-++unsigned int vpu_get_fn(void) {
-++  // Make sure that the gpu is initialized
-++  if (gpu==NULL) {
-++    printf("Preparing gpu\n");
-++    gpu_lock();
-++    gpu_unlock();
-++  }
-++  return gpu->vc + offsetof(struct GPU,vpu_code);
-++}
-++
-++unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-++{
-++  unsigned r;
-++  gpu_lock();
-++  r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-++  gpu_unlock();
-++  return r;
-++}
-++
-++// Run a program on a QPU with the given code and uniform stream (given in GPU addresses)
-++// The first num QPUs will start at code, the next num2 QPUs will start at code2
-++void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12)
-++{
-++  int i;
-++#ifdef RPI_TIME_TOTAL_QPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++#endif
-++
-++  gpu_lock();
-++#ifdef RPI_TIME_TOTAL_QPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-++  for(i=0;i<num;i++) {
-++    gpu->mail[i*2 + 1] = code;
-++  }
-++  for(;i<num+num2;i++) {
-++    gpu->mail[i*2 + 1] = code2;
-++  }
-++  gpu->mail[0 ] = unifs1;
-++  gpu->mail[2 ] = unifs2;
-++  gpu->mail[4 ] = unifs3;
-++  gpu->mail[6 ] = unifs4;
-++  gpu->mail[8 ] = unifs5;
-++  gpu->mail[10] = unifs6;
-++	gpu->mail[12] = unifs7;
-++	gpu->mail[14] = unifs8;
-++	gpu->mail[16] = unifs9;
-++	gpu->mail[18] = unifs10;
-++	gpu->mail[20] = unifs11;
-++	gpu->mail[22] = unifs12;
-++	execute_qpu(
-++		gpu->mb,
-++		12 /* Number of QPUs */,
-++		gpu->vc + offsetof(struct GPU, mail),
-++		1 /* no flush */,  // Don't flush VPU L1 cache
-++		5000 /* timeout ms */);
-++#ifdef RPI_TIME_TOTAL_QPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  if ((count&0x7f)==0)
-++    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-++  gpu_unlock();
-++}
-++
-++unsigned int qpu_get_fn(int num) {
-++    // Make sure that the gpu is initialized
-++    unsigned int *fn;
-++    if (gpu==NULL) {
-++      printf("Preparing gpu\n");
-++      gpu_lock();
-++      gpu_unlock();
-++    }
-++    switch(num) {
-++    case QPU_MC_SETUP:
-++      fn = mc_setup;
-++      break;
-++    case QPU_MC_FILTER:
-++      fn = mc_filter;
-++      break;
-++    case QPU_MC_EXIT:
-++      fn = mc_exit;
-++      break;
-++    case QPU_MC_INTERRUPT_EXIT:
-++      fn = mc_interrupt_exit;
-++      break;
-++    case QPU_MC_FILTER_B:
-++      fn = mc_filter_b;
-++      break;
-++    case QPU_MC_FILTER_HONLY:
-++      fn = mc_filter_honly;
-++      break;
-++    case QPU_MC_SETUP_UV:
-++      fn = mc_setup_uv;
-++      break;
-++    case QPU_MC_FILTER_UV:
-++      fn = mc_filter_uv;
-++      break;
-++    case QPU_MC_FILTER_UV_B:
-++      fn = mc_filter_uv_b;
-++      break;
-++    case QPU_MC_END:
-++      fn = mc_end;
-++      break;
-++    default:
-++      printf("Unknown function\n");
-++      exit(-1);
-++    }
-++    return gpu->vc + 4*(int)(fn-rpi_shader);
-++    //return code[num] + gpu->vc;
-++}
-++
-++#if 0
-++
-++int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-++//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-++int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-++//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-++
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-++
-++static uint8_t av_clip_uint8(int32_t a)
-++{
-++    if (a&(~255)) return (-a)>>31;
-++    else          return a;
-++}
-++
-++static int32_t filter8(const uint8_t *data, int pitch)
-++{
-++   int32_t vsum = 0;
-++   int x, y;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += hcoeffs[x]*data[x + y * pitch];
-++
-++      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-++   }
-++
-++   return av_clip_uint8( (vsum + 64) >> 7);
-++}
-++
-++// Note regression changes coefficients so is not thread safe
-++//#define REGRESSION
-++#ifdef REGRESSION
-++#define CMAX 100
-++#else
-++#define CMAX 2
-++#endif
-++#define YMAX 16
-++
-++int rpi_test_shader(void)
-++{
-++   int i, c;
-++
-++   uint32_t *unifs;
-++
-++   uint8_t *in_buffer;
-++   uint8_t *out_buffer[2];
-++
-++   GPU_MEM_PTR_T unifs_ptr;
-++   GPU_MEM_PTR_T in_buffer_ptr;
-++   GPU_MEM_PTR_T out_buffer_ptr[2];
-++
-++   // Addresses in GPU memory of filter programs
-++   uint32_t mc_setup = 0;
-++   uint32_t mc_filter = 0;
-++   uint32_t mc_exit = 0;
-++
-++   int pitch = 0x500;
-++
-++   if (gpu==NULL) {
-++      gpu_lock();
-++      gpu_unlock();
-++   }
-++
-++   printf("This needs to change to reflect new assembler\n");
-++   // Use table to compute locations of program start points
-++   mc_setup = code[0] + gpu->vc;
-++   mc_filter = code[1] + gpu->vc;
-++   mc_exit = code[2] + gpu->vc;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-++      return -2;
-++   }
-++   unifs = (uint32_t*)unifs_ptr.arm;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-++      return -3;
-++   }
-++   in_buffer = (uint8_t*)in_buffer_ptr.arm;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-++      return -4;
-++   }
-++   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-++   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-++
-++   for (c = 0; c < CMAX; c++) {
-++      int xo[] = {rand()&31, rand()&31};
-++
-++#ifdef REGRESSION
-++      for (i = 0; i < 8; i++) {
-++         hcoeffs[i] = (int8_t)rand();
-++         vcoeffs[i] = (int8_t)rand();
-++         if (hcoeffs[i]==-128)
-++           hcoeffs[i]++;
-++         if (vcoeffs[i]==-128)
-++           vcoeffs[i]++;
-++      }
-++#endif
-++
-++      for (i = 0; i < 64*23; i++) {
-++         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-++         in_buffer[i] = rand();
-++      }
-++
-++      // Clear output array
-++      {
-++        int b;
-++        for(b=0;b<2;b++) {
-++          for(i=0;i<16*16;i++) {
-++            out_buffer[b][i] = 3;
-++          }
-++        }
-++      }
-++
-++      unifs[0] = mc_filter;
-++      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-++      unifs[2] = 64; // src pitch
-++      unifs[3] = pitch; // dst pitch
-++      unifs[4] = 0; // Padding
-++      unifs[5] = 0;
-++      unifs[6] = 0;
-++      unifs[7 ] = mc_filter;
-++      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-++      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++      unifs[13] = out_buffer_ptr[0].vc;
-++      unifs[14] = mc_exit;
-++      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
-++      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++      unifs[20] = out_buffer_ptr[1].vc;
-++
-++      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-++
-++      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-++
-++      //qpu_run_shader(mc_setup, unifs_ptr.vc);
-++      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-++      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-++      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-++
-++      if (1)
-++      {
-++         int x, y, b;
-++         int bad = 0;
-++
-++         for (b=0; b<2; ++b)
-++            for (y=0; y<YMAX; ++y)
-++               for (x=0; x<16; ++x) {
-++                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
-++
-++                  if (out_buffer[b][x+y*pitch] != ref) {
-++                      bad = 1;
-++//                     printf("%d, %d, %d, %d\n", c, b, x, y);
-++                  }
-++#ifndef REGRESSION
-++                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
-++#endif
-++               }
-++          if (bad)
-++            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-++          else
-++            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
-++      }
-++      //printf("%d\n", simpenrose_get_qpu_tick_count());
-++   }
-++
-++   gpu_free(&out_buffer_ptr[0]);
-++   gpu_free(&out_buffer_ptr[1]);
-++   gpu_free(&in_buffer_ptr);
-++   gpu_free(&unifs_ptr);
-++
-++   return 0;
-++}
-++
-++void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
-++{
-++  int x,y;
-++  for (y=0; y<16; ++y) {
-++    for (x=0; x<16; ++x) {
-++       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
-++    }
-++  }
-++}
-++
-++void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
-++{
-++   uint32_t *unifs;
-++
-++   GPU_MEM_PTR_T unifs_ptr;
-++   //uint8_t *out_buffer;
-++   //GPU_MEM_PTR_T out_buffer_ptr;
-++
-++   // Addresses in GPU memory of filter programs
-++   uint32_t mc_setup = 0;
-++   uint32_t mc_filter = 0;
-++   uint32_t mc_exit = 0;
-++   //int x,y;
-++
-++   if (gpu==NULL) {
-++      gpu_lock();
-++      gpu_unlock();
-++   }
-++
-++   // Use table to compute locations of program start points
-++   mc_setup = code[0] + gpu->vc;
-++   mc_filter = code[1] + gpu->vc;
-++   mc_exit = code[2] + gpu->vc;
-++
-++   if (!vcos_verify(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-++      return;
-++   }
-++   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-++   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
-++
-++   /*for (y=0; y<16; ++y) {
-++      for (x=0; x<16; ++x) {
-++         out_buffer[x+y*dst_pitch] = 7;
-++      }
-++    }*/
-++
-++   unifs = (uint32_t*)unifs_ptr.arm;
-++
-++    unifs[0] = mc_filter;
-++    unifs[1] = (int)in_buffer_vc;
-++    unifs[2] = src_pitch; // src pitch
-++    unifs[3] = dst_pitch; // dst pitch
-++    unifs[4] = 0; // Padding
-++    unifs[5] = 0;
-++    unifs[6] = 0;
-++    unifs[7 ] = mc_exit;
-++    unifs[8 ] = (int)in_buffer_vc;
-++    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-++    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-++    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-++    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-++    unifs[13] = (int)dst_vc;
-++    //unifs[13] = (int)out_buffer_ptr.vc;
-++
-++    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-++
-++    qpu_run_shader(mc_setup, unifs_ptr.vc);
-++
-++    /*for (y=0; y<16; ++y) {
-++      for (x=0; x<16; ++x) {
-++         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-++      }
-++    }*/
-++
-++    gpu_free(&unifs_ptr);
-++    //gpu_free(&out_buffer_ptr);
-++}
-++
-++
-++#endif
-++
-++#endif // RPI
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+new file mode 100644
-+index 0000000..4e3c35c
-+--- /dev/null
-++++ b/libavcodec/rpi_qpu.h
-+@@ -0,0 +1,45 @@
-++#ifndef RPI_QPU_H
-++#define RPI_QPU_H
-++
-++typedef struct gpu_mem_ptr_s {
-++  unsigned char *arm; // Pointer to memory mapped on ARM side
-++  int vc_handle;   // Videocore handle of relocatable memory
-++  int vcsm_handle; // Handle for use by VCSM
-++  int vc;       // Address for use in GPU code
-++  int numbytes; // Size of memory block
-++} GPU_MEM_PTR_T;
-++
-++// General GPU functions
-++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-++extern void gpu_free(GPU_MEM_PTR_T *p);
-++extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-++
-++// QPU specific functions
-++extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-++
-++enum {
-++  QPU_MC_SETUP,
-++  QPU_MC_FILTER,
-++  QPU_MC_EXIT,
-++  QPU_MC_INTERRUPT_EXIT,
-++  QPU_MC_FILTER_B,
-++  QPU_MC_FILTER_HONLY,
-++  QPU_MC_SETUP_UV,
-++  QPU_MC_FILTER_UV,
-++  QPU_MC_FILTER_UV_B,
-++  QPU_MC_END
-++  };
-++extern unsigned int qpu_get_fn(int num);
-++
-++// VPU specific functions
-++extern unsigned int vpu_get_fn(void);
-++extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++
-++// Simple test of shader code
-++extern int rpi_test_shader(void);
-++
-++extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
-++extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
-++
-++#endif
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+new file mode 100644
-+index 0000000..41cc2e1
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.c
-+@@ -0,0 +1,818 @@
-++#include "rpi_shader.h"
-++
-++#ifdef _MSC_VER
-++   #include <stdint.h>
-++   /* cast through uintptr_t to avoid warnings */
-++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-++#else
-++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
-++#endif
-++
-++#ifdef __cplusplus
-++extern "C" { /* the types are probably wrong... */
-++#endif
-++#ifdef __cplusplus
-++}
-++#endif
-++
-++#ifdef _MSC_VER
-++__declspec(align(8))
-++#elif defined(__GNUC__)
-++__attribute__((aligned(8)))
-++#endif
-++unsigned int rpi_shader[] = {
-++// ::mc_setup
-++/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
-++/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
-++/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++// ::mc_filter_uv
-++/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop
-++/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter
-++/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-++/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :loop
-++/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// :fast_path
-++/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :fast_loop
-++/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-++/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-++/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_b
-++/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :bloop
-++/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-++/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_honly
-++/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-++/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-++/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :loop_honly
-++/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-++/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-++/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-++/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-++/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-++/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-++/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_exit
-++/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_exit1
-++/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit
-++/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit4
-++/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_setup_uv
-++/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++// ::mc_filter_uv_b
-++/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop_b
-++/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_end
-++};
-++#ifdef __HIGHC__
-++#pragma Align_to(8, rpi_shader)
-++#endif
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+new file mode 100644
-+index 0000000..db971f4
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.h
-+@@ -0,0 +1,20 @@
-++#ifndef rpi_shader_H
-++#define rpi_shader_H
-++
-++extern unsigned int rpi_shader[];
-++
-++#define mc_setup (rpi_shader + 0)
-++#define mc_filter_uv (rpi_shader + 146)
-++#define mc_filter (rpi_shader + 360)
-++#define mc_filter_b (rpi_shader + 670)
-++#define mc_filter_honly (rpi_shader + 894)
-++#define mc_exit (rpi_shader + 1048)
-++#define mc_exit1 (rpi_shader + 1066)
-++#define mc_interrupt_exit (rpi_shader + 1082)
-++#define mc_interrupt_exit4 (rpi_shader + 1120)
-++#define mc_interrupt_exit8 (rpi_shader + 1142)
-++#define mc_setup_uv (rpi_shader + 1172)
-++#define mc_filter_uv_b (rpi_shader + 1314)
-++#define mc_end (rpi_shader + 1542)
-++
-++#endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+new file mode 100644
-+index 0000000..6851e83
-+--- /dev/null
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -0,0 +1,1413 @@
-++# register allocation
-++#
-++# ra0...ra7                                     eight horizontal filter coefficients
-++#
-++# rb1...rb7                                     seven shifted copies of the current unfiltered row
-++#
-++# ra8...ra15                                    eight filtered rows of context (ra15 == most recent)
-++#
-++#                                               (ra15 isn't clamped to zero - this happens during the
-++#                                                copy to ra14, and during its use in the vertical filter)
-++#
-++# rb8...rb15                                    eight vertical filter coefficients
-++#
-++# ra16                                          clipped(row start address+elem_num)&~3
-++# ra17                                          per-channel shifts
-++# ra19                                          next ra17
-++#
-++# rb16                                          pitch
-++# rb17                                          height + 5
-++# rb18                                          height + 7
-++# rb19                                          next ra16
-++#
-++# ra20                                          1
-++# ra21                                          64
-++# ra22                                          256
-++# ra23                                          8
-++#
-++# rb20                                          0xffffff00
-++# rb21                                          64
-++# rb22                                          255
-++# rb23                                          24
-++#
-++# rb24                                          vdw_setup_1(dst_pitch)
-++# rb25                                          frame width-1
-++# rb26                                          height<<23 + width<<16 + vdw_setup_0
-++# rb27                                          vdw_setup_0 (depends on QPU number)
-++# rb28                                          vpm_setup (depends on QPU number)
-++# rb29                                          vdw_setup_1(dst_pitch-width)
-++# rb30                                          frame height-1
-++# rb31                                          used as temp to count loop iterations
-++#
-++# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
-++# ra24                                          clipped(row start address+8+elem_num)&~3
-++# ra25                                          per-channel shifts 2
-++# ra26                                          next ra24
-++# ra27                                          next ra25
-++# ra28                                          next y
-++# ra29                                          y for next texture access
-++#
-++# ra31                                          next kernel address
-++
-++.set rb_frame_width_minus_1,       rb25
-++.set rb_frame_height_minus_1,      rb30
-++.set rb_pitch,                     rb16
-++.set ra_x_base,                    ra16
-++.set rb_x_base_next,               rb19
-++.set ra_x2_base,                   ra24
-++.set ra_x2_base_next,              ra26
-++.set ra_xshift,                    ra17
-++
-++.set ra_x2shift,                   ra25
-++.set ra_u2v_ref_offset,            ra25
-++
-++.set ra_xshift_next,               ra19
-++
-++.set ra_x2shift_next,              ra27
-++.set ra_u2v_dst_offset,            ra27
-++
-++.set ra_y_next,                    ra28
-++.set ra_y,                         ra29
-++
-++.set rb_const_64,                  rb21
-++
-++# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
-++::mc_setup
-++
-++# Read starting kernel
-++mov ra31, unif
-++
-++# Load first request location
-++add ra_x_base, unif, elem_num # Store x
-++mov ra_y, unif # Store y
-++mov ra_x2_base, unif # Store frame base
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra21, 64
-++mov ra22, 256
-++mov ra23, 8
-++
-++mov rb20, 0xffffff00
-++mov rb21, 64
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))
-++add rb28, r0, r1
-++
-++# Compute base address for first and second access
-++#add r0, unif, elem_num     # x
-++mov r0, ra_x_base           # Load x
-++add r2, r0, 8               # x+8
-++max r0, r0, 0; mov r1, ra_y # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++add ra_y, r1, 1
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++max r1, r1, 0  # y
-++min r1, r1, rb_frame_height_minus_1
-++add r0, r0, r3; mul24 r1, r1, rb_pitch
-++add r2, r2, r3
-++and r0, r0, ~3
-++and r2, r2, ~3; mov ra_x_base, r0
-++# submit texture requests for first line
-++add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r2, r1
-++
-++# Dump padding words
-++mov r0, unif
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++bra -, ra31
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_x_base
-++add t0s, r1, ra_x2_base
-++
-++################################################################################
-++
-++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x2_base point to the current coordinates for this block
-++::mc_filter_uv
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:uvloop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++
-++# mc_filter(next_kernel, x, y, frame_base, width_height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x2_base point to the current coordinates for this block
-++::mc_filter
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++brr.anynn -, r:fast_path
-++asr rb12, r0, rb23  # delay slot 1
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8 # delay slot 2
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r2, r2, r3                                                    ; ldtmu0
-++##
-++## mov r0, ra22
-++## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # apply horizontal filter
-++##
-++## asr r2, r2, 15    ; mul24 r3, r0, ra0
-++## min r2, r2, rb22
-++## max ra13, r2, 0
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21
-++## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
-++## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra14, r0, 0
-++##
-++##
-++##
-++##
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra15, r0, 0
-++
-++
-++
-++
-++mov r3, 0
-++
-++:loop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:loop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:loop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++####################################################
-++
-++:fast_path
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21         ; mul24 r3, r0, ra0
-++## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++## sub r2, r2, r3                                                    ; ldtmu0
-++##
-++## mov r0, ra22
-++## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # apply horizontal filter
-++##
-++## asr r2, r2, 15    ; mul24 r3, r0, ra0
-++## min r2, r2, rb22
-++## max ra13, r2, 0
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21
-++## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra14, r0, 0
-++##
-++##
-++##
-++##
-++## nop                                                                 ; ldtmu0     # loop counter increment
-++## shr r0, r4, ra17                                                    ; ldtmu0
-++## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-++## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-++## add ra16, ra16, rb16 ; mov t0s, ra16
-++##
-++## # generate seven shifted versions
-++## # interleave with scroll of vertical context
-++##
-++## mov r2, rb21   ; mul24    r3, r0, ra0
-++## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-++## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-++## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-++## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-++## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-++## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-++## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-++## sub r0, r2, r3
-++##
-++## # apply horizontal filter
-++##
-++## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-++## asr r0, r0, 15
-++## min r0, r0, rb22
-++## max ra15, r0, 0
-++
-++
-++mov r3, 0  # This signifies the amount of unrolling
-++
-++:fast_loop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++
-++max r2, ra_y, 0
-++min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++sub r0, r2, r3       ; mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8       ; mov r1, ra22
-++
-++# apply horizontal filter
-++
-++brr.anyn -, r:fast_loop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++brr.anyn -, r:fast_loop
-++asr r1, r1, 15
-++min r1, r1, rb22
-++max vpm, r1, 0
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_b
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++# r0 is currently height<<7
-++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-++shl r3, r0, 13
-++shl r3, r3, 8 # Mask off top 8 bits
-++shr r3, r3, 8
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++# In a B frame, so also set up VPM read
-++add vr_setup, r3, rb28
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov r3, 0
-++
-++:bloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:bloop
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 15          ; mov -, vr_wait
-++min r1, r1, rb22
-++add r0, vpm, 1          # Blend in previous VPM contents at this location
-++brr.anyn -, r:bloop
-++max r1, r1, 0
-++add r1, r1, r0
-++shr vpm, r1, 1
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-++# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-++# This filter only does horizontal filtering.
-++# It is assumed that the region to fetch does not include extra rows above.
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_honly
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov ra_x2shift, ra_x2shift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++add r2, r0, 8 # x+8
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++shl ra_xshift_next, r0, 3
-++max r2, r2, 0
-++min r2, r2, rb_frame_width_minus_1
-++shl ra_x2shift_next, r2, 3
-++add r0, r0, r3
-++add r2, r2, r3
-++and rb_x_base_next, r0, ~3
-++and ra_x2_base_next, r2, ~3
-++mov ra_y_next, r1
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
-++shl r0, r0, 7 ; mov rb18,r0
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++mov r0, unif
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-++mov r3, 0
-++
-++:loop_honly
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3       ; mov r3, rb31
-++
-++sub.setf -, r3, rb18 ; mov r1, ra22
-++
-++mov -, vw_wait   ; mul24 r0, r0, r1
-++brr.anyn -, r:loop_honly
-++asr r0, r0, 15          # delay 1
-++min r0, r0, rb22        # delay 2
-++max vpm, r0, 0          # delay 3
-++
-++# DMA out
-++bra -, ra31
-++mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-++mov vw_setup, rb29
-++mov vw_addr, unif # start the VDW
-++
-++
-++################################################################################
-++
-++# mc_exit()
-++
-++::mc_exit
-++mov  -, vw_wait # wait on the VDW
-++
-++mov -,srel(0)
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++nop        ; nop ; thrend
-++nop        ; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++::mc_exit1
-++mov  -, vw_wait # wait on the VDW
-++
-++#mov -,srel(1)
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit()
-++::mc_interrupt_exit
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++mov -,sacq(0) # 8
-++mov -,sacq(0) # 9
-++mov -,sacq(0) # 10
-++mov -,sacq(0) # 11
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit4()
-++::mc_interrupt_exit4
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++# mc_interrupt_exit8()
-++::mc_interrupt_exit8
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++################################################################################
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++::mc_setup_uv
-++
-++# Read starting kernel
-++mov ra31, unif
-++
-++# Load first request location
-++add ra_x_base, unif, elem_num # Store x
-++mov ra_y, unif # Store y
-++mov ra_x2_base, unif # Store frame u base
-++nop
-++sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra21, 64
-++mov ra22, 256
-++mov ra23, 8
-++
-++mov rb20, 0xffffff00
-++mov rb21, 64
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))
-++add rb28, r0, r1
-++
-++# Compute base address for first and second access
-++mov r0, ra_x_base           # Load x
-++max r0, r0, 0; mov r1, ra_y # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++add ra_y, r1, 1
-++add r0, r0, r3
-++and r0, r0, ~3
-++max r1, r1, 0 ; mov ra_x_base, r0 # y
-++min r1, r1, rb_frame_height_minus_1
-++# submit texture requests for first line
-++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r2, r1
-++
-++# Dump padding words
-++mov r0, unif
-++mov r0, unif
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++bra -, ra31
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_x_base
-++add t0s, r1, ra_x2_base
-++
-++
-++
-++################################################################################
-++
-++::mc_filter_uv_b
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++
-++# r0 is currently height<<7
-++# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-++shl r3, r0, 13
-++shl r3, r3, 8 # Mask off top 8 bits
-++shr r3, r3, 8
-++
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# In a B frame, so also set up VPM read
-++add vr_setup, r3, rb28
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop_b
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++sub r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop_b
-++max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr r0, r0, 15          ; mov r1, ra21
-++min.setf ra15, r0, rb22
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r0, ra14, rb14
-++sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 15
-++min r1, r1, rb22
-++add r0, vpm, 1          # Blend in previous VPM contents at this location
-++brr.anyn -, r:uvloop_b
-++max r1, r1, 0
-++add r1, r1, r0
-++shr vpm, r1, 1
-++
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++::mc_end
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+new file mode 100644
-+index 0000000..fbebbbe
-+--- /dev/null
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -0,0 +1,425 @@
-++/*
-++Copyright (c) 2012, Broadcom Europe Ltd
-++All rights reserved.
-++
-++Redistribution and use in source and binary forms, with or without
-++modification, are permitted provided that the following conditions are met:
-++    * Redistributions of source code must retain the above copyright
-++      notice, this list of conditions and the following disclaimer.
-++    * Redistributions in binary form must reproduce the above copyright
-++      notice, this list of conditions and the following disclaimer in the
-++      documentation and/or other materials provided with the distribution.
-++    * Neither the name of the copyright holder nor the
-++      names of its contributors may be used to endorse or promote products
-++      derived from this software without specific prior written permission.
-++
-++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-++*/
-++
-++#ifndef __USER_VCSM__H__INCLUDED__
-++#define __USER_VCSM__H__INCLUDED__
-++
-++/* VideoCore Shared Memory - user interface library.
-++**
-++** This library provides all the necessary abstraction for any application to
-++** make use of the shared memory service which is distributed accross a kernel
-++** driver and a videocore service.
-++**
-++** It is an application design decision to choose or not to use this service.
-++**
-++** The logical flow of operations that a user application needs to follow when
-++** using this service is:
-++**
-++**       1) Initialize the service.
-++**       2) Allocate shared memory blocks.
-++**       3) Start using the allocated blocks.
-++**          - In order to gain ownership on a block, lock the allocated block,
-++**            locking a block returns a valid address that the user application
-++**            can access.
-++**          - When finished with using the block for the current execution cycle
-++**            or function, and so when giving up the ownership, unlock the block.
-++**       4) A block can be locked/unlocked as many times required - within or outside
-++**          of - a specific execution context.
-++**       5) To completely release an allocated block, free it.
-++**       6) If the service is no longer required, terminate it.
-++**
-++**
-++** Some generic considerations:
-++
-++** Allocating memory blocks.
-++**
-++**   Memory blocks can be allocated in different manners depending on the cache
-++**   behavior desired.  A given block can either be:
-++
-++**       - Allocated in a non cached fashion all the way through host and videocore.
-++**       - Allocated in a cached fashion on host OR videocore.
-++**       - Allocated in a cached fashion on host AND videocore.
-++**
-++**   It is an application decision to determine how to allocate a block.  Evidently
-++**   if the application will be doing substantial read/write accesses to a given block,
-++**   it is recommended to allocate the block at least in a 'host cached' fashion for
-++**   better results.
-++**
-++**
-++** Locking memory blocks.
-++**
-++**   When the memory block has been allocated in a host cached fashion, locking the
-++**   memory block (and so taking ownership of it) will trigger a cache invalidation.
-++**
-++**   For the above reason and when using host cached allocation, it is important that
-++**   an application properly implements the lock/unlock mechanism to ensure cache will
-++**   stay coherent, otherwise there is no guarantee it will at all be.
-++**
-++**   It is possible to dynamically change the host cache behavior (ie cached or non
-++**   cached) of a given allocation without needing to free and re-allocate the block.
-++**   This feature can be useful for such application which requires access to the block
-++**   only at certain times and not otherwise.  By changing the cache behavior dynamically
-++**   the application can optimize performances for a given duration of use.
-++**   Such dynamic cache behavior remapping only applies to host cache and not videocore
-++**   cache.  If one requires to change the videocore cache behavior, then a new block
-++**   must be created to replace the old one.
-++**
-++**   On successful locking, a valid pointer is returned that the application can use
-++**   to access to data inside the block.  There is no guarantee that the pointer will
-++**   stay valid following the unlock action corresponding to this lock.
-++**
-++**
-++** Unocking memory blocks.
-++**
-++**   When the memory block has been allocated in a host cached fashion, unlocking the
-++**   memory block (and so forgiving its ownership) will trigger a cache flush unless
-++**   explicitely asked not to flush the cache for performances reasons.
-++**
-++**   For the above reason and when using host cached allocation, it is important that
-++**   an application properly implements the lock/unlock mechanism to ensure cache will
-++**   stay coherent, otherwise there is no guarantee it will at all be.
-++**
-++**
-++** A complete API is defined below.
-++*/
-++
-++#ifdef __cplusplus
-++extern "C"
-++{
-++#endif
-++
-++/* Different status that can be dumped.
-++*/
-++typedef enum
-++{
-++   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
-++                                    // Result of the walk is seen in the videocore
-++                                    // log.
-++   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
-++                                    // driver (ie for all processes).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
-++                                    // driver (for current process).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-++                                    // driver (for current process).  Result of
-++                                    // the walk is seen in the kernel log.
-++   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-++                                    // VCSM_STATUS_HOST_WALK_MAP.
-++                                    //
-++   VCSM_STATUS_NONE,                // Must be last - invalid.
-++
-++} VCSM_STATUS_T;
-++
-++/* Different kind of cache behavior.
-++*/
-++typedef enum
-++{
-++   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
-++   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
-++   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
-++   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
-++
-++} VCSM_CACHE_TYPE_T;
-++
-++/* Initialize the vcsm processing.
-++**
-++** Must be called once before attempting to do anything else.
-++**
-++** Returns 0 on success, -1 on error.
-++*/
-++int vcsm_init( void );
-++
-++
-++/* Terminates the vcsm processing.
-++**
-++** Must be called vcsm services are no longer needed, it will
-++** take care of removing any allocation under the current process
-++** control if deemed necessary.
-++*/
-++void vcsm_exit( void );
-++
-++
-++/* Queries the status of the the vcsm.
-++**
-++** Triggers dump of various kind of information, see the
-++** different variants specified in VCSM_STATUS_T.
-++**
-++** Pid is optional.
-++*/
-++void vcsm_status( VCSM_STATUS_T status, int pid );
-++
-++
-++/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-++** allocator.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc( unsigned int size, char *name );
-++
-++
-++/* Allocates a cached block of memory of size 'size' via the vcsm memory
-++** allocator, the type of caching requested is passed as argument of the
-++** function call.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-++
-++
-++/* Shares an allocated block of memory via the vcsm memory allocator.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** On success, the user must invoke vcsm_lock with the returned opaque
-++** handle to gain access to the memory associated with the opaque handle.
-++** When finished using the memory, the user calls vcsm_unlock_xx (see those
-++** function definition for more details on the one that can be used).
-++**
-++** A well behaved application should make every attempt to lock/unlock
-++** only for the duration it needs to access the memory data associated with
-++** the opaque handle.
-++*/
-++unsigned int vcsm_malloc_share( unsigned int handle );
-++
-++
-++/* Resizes a block of memory allocated previously by vcsm_alloc.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** The handle must be unlocked by user prior to attempting any
-++** resize action.
-++**
-++** On error, the original size allocated against the handle
-++** remains available the same way it would be following a
-++** successful vcsm_malloc.
-++*/
-++int vcsm_resize( unsigned int handle, unsigned int new_size );
-++
-++
-++/* Frees a block of memory that was successfully allocated by
-++** a prior call the vcms_alloc.
-++**
-++** The handle should be considered invalid upon return from this
-++** call.
-++**
-++** Whether any memory is actually freed up or not as the result of
-++** this call will depends on many factors, if all goes well it will
-++** be freed.  If something goes wrong, the memory will likely end up
-++** being freed up as part of the vcsm_exit process.  In the end the
-++** memory is guaranteed to be freed one way or another.
-++*/
-++void vcsm_free( unsigned int handle );
-++
-++
-++/* Retrieves a videocore opaque handle from a mapped user address
-++** pointer.  The videocore handle will correspond to the actual
-++** memory mapped in videocore.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** Note: the videocore opaque handle is distinct from the user
-++**       opaque handle (allocated via vcsm_malloc) and it is only
-++**       significant for such application which knows what to do
-++**       with it, for the others it is just a number with little
-++**       use since nothing can be done with it (in particular
-++**       for safety reason it cannot be used to map anything).
-++*/
-++unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-++
-++
-++/* Retrieves a videocore opaque handle from a opaque handle
-++** pointer.  The videocore handle will correspond to the actual
-++** memory mapped in videocore.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++**
-++** Note: the videocore opaque handle is distinct from the user
-++**       opaque handle (allocated via vcsm_malloc) and it is only
-++**       significant for such application which knows what to do
-++**       with it, for the others it is just a number with little
-++**       use since nothing can be done with it (in particular
-++**       for safety reason it cannot be used to map anything).
-++*/
-++unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-++
-++
-++/* Retrieves a user opaque handle from a mapped user address
-++** pointer.
-++**
-++** Returns:        0 on error
-++**                 a non-zero opaque handle on success.
-++*/
-++unsigned int vcsm_usr_handle( void *usr_ptr );
-++
-++
-++/* Retrieves a mapped user address from an opaque user
-++** handle.
-++**
-++** Returns:        0 on error
-++**                 a non-zero address on success.
-++**
-++** On success, the address corresponds to the pointer
-++** which can access the data allocated via the vcsm_malloc
-++** call.
-++*/
-++void *vcsm_usr_address( unsigned int handle );
-++
-++
-++/* Locks the memory associated with this opaque handle.
-++**
-++** Returns:        NULL on error
-++**                 a valid pointer on success.
-++**
-++** A user MUST lock the handle received from vcsm_malloc
-++** in order to be able to use the memory associated with it.
-++**
-++** On success, the pointer returned is only valid within
-++** the lock content (ie until a corresponding vcsm_unlock_xx
-++** is invoked).
-++*/
-++void *vcsm_lock( unsigned int handle );
-++
-++
-++/* Locks the memory associated with this opaque handle.  The lock
-++** also gives a chance to update the *host* cache behavior of the
-++** allocated buffer if so desired.  The *videocore* cache behavior
-++** of the allocated buffer cannot be changed by this call and such
-++** attempt will be ignored.
-++**
-++** The system will attempt to honour the cache_update mode request,
-++** the cache_result mode will provide the final answer on which cache
-++** mode is really in use.  Failing to change the cache mode will not
-++** result in a failure to lock the buffer as it is an application
-++** decision to choose what to do if (cache_result != cache_update)
-++**
-++** The value returned in cache_result can only be considered valid if
-++** the returned pointer is non NULL.  The cache_result pointer may be
-++** NULL if the application does not care about the actual outcome of
-++** its action with regards to the cache behavior change.
-++**
-++** Returns:        NULL on error
-++**                 a valid pointer on success.
-++**
-++** A user MUST lock the handle received from vcsm_malloc
-++** in order to be able to use the memory associated with it.
-++**
-++** On success, the pointer returned is only valid within
-++** the lock content (ie until a corresponding vcsm_unlock_xx
-++** is invoked).
-++*/
-++void *vcsm_lock_cache( unsigned int handle,
-++                       VCSM_CACHE_TYPE_T cache_update,
-++                       VCSM_CACHE_TYPE_T *cache_result );
-++
-++
-++/* Unlocks the memory associated with this user mapped address.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking a mapped address, the user should no longer
-++** attempt to reference it.
-++*/
-++int vcsm_unlock_ptr( void *usr_ptr );
-++
-++
-++/* Unlocks the memory associated with this user mapped address.
-++** Apply special processing that would override the otherwise
-++** default behavior.
-++**
-++** If 'cache_no_flush' is specified:
-++**    Do not flush cache as the result of the unlock (if cache
-++**    flush was otherwise applicable in this case).
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking a mapped address, the user should no longer
-++** attempt to reference it.
-++*/
-++int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-++
-++
-++/* Unlocks the memory associated with this user opaque handle.
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking an opaque handle, the user should no longer
-++** attempt to reference the mapped addressed once associated
-++** with it.
-++*/
-++int vcsm_unlock_hdl( unsigned int handle );
-++
-++
-++/* Unlocks the memory associated with this user opaque handle.
-++** Apply special processing that would override the otherwise
-++** default behavior.
-++**
-++** If 'cache_no_flush' is specified:
-++**    Do not flush cache as the result of the unlock (if cache
-++**    flush was otherwise applicable in this case).
-++**
-++** Returns:        0 on success
-++**                 -errno on error.
-++**
-++** After unlocking an opaque handle, the user should no longer
-++** attempt to reference the mapped addressed once associated
-++** with it.
-++*/
-++int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-++
-++#ifdef __cplusplus
-++}
-++#endif
-++
-++#endif /* __USER_VCSM__H__INCLUDED__ */
-+-- 
-+2.5.0
-+
-+
-+From 603cf327694d2f986538f13e6b8a1d92b2a9e0b2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@argondesign.com>
-+Date: Sat, 2 May 2015 21:15:37 +0100
-+Subject: [PATCH 04/68] First working version with uncached memory
-+
-+---
-+ libavcodec/hevc.c               |  61 +++++-
-+ libavcodec/hevc.h               |  12 +-
-+ libavcodec/hevc_cabac.c         |  39 +++-
-+ libavcodec/hevc_filter.c        |  16 ++
-+ libavcodec/hevcpred_template.c  |   6 +
-+ libavcodec/rpi_hevc_transform.h | 422 +++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s | 153 +++++++++++++--
-+ libavcodec/rpi_qpu.c            |  72 +++++++
-+ libavcodec/rpi_qpu.h            |   1 +
-+ 9 files changed, 736 insertions(+), 46 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index e58a3d0..4aacb60 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -43,6 +43,8 @@
-+ #include "rpi_qpu.h"
-+ #endif
-+ 
-++// #define DISABLE_MC
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ /**
-+@@ -1066,11 +1068,15 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-++                        printf("Cross component not supported\n"); // TODO
-++                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+ 
-+             if (lc->tu.cross_pf) {
-++                printf("Cross component not supported\n"); // TODO
-++                exit(-1);
-+                 hls_cross_component_pred(s, 1);
-+             }
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+@@ -1099,6 +1105,8 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-++                        printf("Cross component not supported\n"); // TODO
-++                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+@@ -1396,6 +1404,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+     int idx              = ff_hevc_pel_weight[block_w];
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     x_off += mv->x >> 2;
-+     y_off += mv->y >> 2;
-+     src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-+@@ -1466,6 +1478,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+     uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
-+     uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
-+         x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
-+         y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
-+@@ -1551,6 +1567,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+     intptr_t _mx         = mx << (1 - hshift);
-+     intptr_t _my         = my << (1 - vshift);
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     x_off += mv->x >> (2 + hshift);
-+     y_off += mv->y >> (2 + vshift);
-+     src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
-+@@ -1615,6 +1635,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
-+     int hshift = s->ps.sps->hshift[1];
-+     int vshift = s->ps.sps->vshift[1];
-+ 
-++#ifdef DISABLE_MC
-++    return;
-++#endif
-++
-+     intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
-+     intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
-+     intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
-+@@ -2354,6 +2378,22 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ }
-+ 
-+ #ifdef RPI
-++static void rpi_execute_transform(HEVCContext *s)
-++{
-++    int i=2;
-++    //int j;
-++    //int16_t *coeffs = s->coeffs_buf_arm[i];
-++    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-++    //    s->hevcdsp.idct[4-2](coeffs, 16);
-++    //}
-++
-++    //gpu_cache_flush(&s->coeffs_buf[i]);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-++
-++    for(i=0;i<4;i++)
-++        s->num_coeffs[i] = 0;
-++}
-++
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+@@ -2374,7 +2414,6 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+-  s->num_coeffs = 0;
-+ }
-+ #endif
-+ 
-+@@ -2421,7 +2460,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (x_ctb + ctb_size >= s->ps.sps->width) {
-++        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-++            rpi_execute_transform(s);
-+             rpi_execute_pred_cmds(s);
-+         }
-+ #endif
-+@@ -3102,7 +3142,9 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_mv_cmds);
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+-    av_freep(&s->coeffs_buf);
-++    for(i = 0; i < 4; i++) {
-++        gpu_free(&s->coeffs_buf[i]);
-++    }
-+ #endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+@@ -3174,13 +3216,16 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+-    s->coeffs_buf = av_mallocz(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16);
-+-    if (!s->coeffs_buf)
-+-        goto fail;
-++    for(i = 0; i < 4; i++) {
-++        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-++        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-++        if (!s->coeffs_buf_arm[i])
-++            goto fail;
-++    }
-+     s->enable_rpi = 0;
-+ 
-+     // A little test program
-+-    {
-++    /*{
-+       GPU_MEM_PTR_T p;
-+       int err = gpu_malloc_cached(16, &p);
-+       short *q = (short *)p.arm;
-+@@ -3201,7 +3246,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+       printf(")\n");
-+       gpu_free(&p);
-+       goto fail; // Early out
-+-    }
-++    }*/
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index aa66b00..f201817 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -39,6 +39,11 @@
-+ #include "thread.h"
-+ #include "videodsp.h"
-+ 
-++// define RPI to split the CABAC/prediction/transform into separate stages
-++#ifdef RPI
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+ #define MAX_REFS 16
-+ 
-+@@ -882,11 +887,12 @@ typedef struct HEVCContext {
-+     HEVCMvCmd *unif_mv_cmds;
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+-    int16_t *coeffs_buf;
-+-    int num_mv_cmds;
-++    GPU_MEM_PTR_T coeffs_buf[4];
-++    int16_t *coeffs_buf_arm[4];
-++    int num_coeffs[4];
-+     int num_xfm_cmds;
-++    int num_mv_cmds;
-+     int num_pred_cmds;
-+-    int num_coeffs;
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index c0fdfad..a7561bd 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1031,6 +1031,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     int vshift = s->ps.sps->vshift[c_idx];
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-+     int explicit_rdpcm_flag = 0;
-+@@ -1044,6 +1045,18 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     uint8_t dc_scale;
-+     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-+                                          lc->tu.intra_pred_mode_c;
-++#ifdef RPI
-++    if (s->enable_rpi) {
-++        int n = trafo_size * trafo_size;
-++        if (use_vpu) {
-++            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
-++            s->num_coeffs[log2_trafo_size - 2] += n;
-++        } else {
-++            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
-++            s->num_coeffs[0] += n;
-++        }
-++    }
-++#endif
-+ 
-+     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ 
-+@@ -1488,6 +1501,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+             s->hevcdsp.idct_4x4_luma(coeffs);
-+         } else {
-++#ifdef RPI
-++            if (!use_vpu) {
-++              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-++              if (max_xy == 0)
-++                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-++              else {
-++                  int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-++                  if (max_xy < 4)
-++                      col_limit = FFMIN(4, col_limit);
-++                  else if (max_xy < 8)
-++                      col_limit = FFMIN(8, col_limit);
-++                  else if (max_xy < 12)
-++                      col_limit = FFMIN(24, col_limit);
-++
-++                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-++              }
-++            }
-++#else
-+             int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+             if (max_xy == 0)
-+                 s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-+@@ -1501,6 +1532,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                     col_limit = FFMIN(24, col_limit);
-+                 s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
-+             }
-++#endif
-+         }
-+     }
-+     if (lc->tu.cross_pf) {
-+@@ -1512,14 +1544,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     }
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+-        int16_t *c = s->coeffs_buf + s->num_coeffs;
-+-        int n = trafo_size * trafo_size;
-+         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+-        memcpy(c, coeffs, n * sizeof(int16_t));  // TODO change pointer earlier and we can avoid this copy
-+-        s->num_coeffs += n;
-++        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+-        cmd->buf = c;
-++        cmd->buf = coeffs;
-+         cmd->dst = dst;
-+         cmd->stride = stride;
-+         return;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 1f33b0c..e4c3da7 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -22,6 +22,10 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++//#define DISABLE_SAO
-++//#define DISABLE_DEBLOCK
-++//#define DISABLE_STRENGTHS
-++
-+ #include "libavutil/common.h"
-+ #include "libavutil/internal.h"
-+ 
-+@@ -273,6 +277,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
-+     edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
-+     edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
-+ 
-++#ifdef DISABLE_SAO
-++    return;
-++#endif
-++
-+     if (restore) {
-+         if (!edges[0]) {
-+             left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-+@@ -496,6 +504,10 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-++#ifdef DISABLE_DEBLOCK
-++    return;
-++#endif
-++
-+     if (x0) {
-+         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-+         left_beta_offset = s->deblock[ctb - 1].beta_offset;
-+@@ -726,6 +738,10 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+     int boundary_upper, boundary_left;
-+     int i, j, bs;
-+ 
-++#ifdef DISABLE_STRENGTHS
-++    return;
-++#endif
-++
-+     boundary_upper = y0 > 0 && !(y0 & 7);
-+     if (boundary_upper &&
-+         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 6ae87cc..71c6d52 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -20,6 +20,8 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++//#define DISABLE_INTRA
-++
-+ #include "libavutil/pixdesc.h"
-+ 
-+ #include "bit_depth_template.c"
-+@@ -114,6 +116,10 @@ do {                                  \
-+     int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
-+                            (x0 + size_in_luma_h)) >> hshift;
-+ 
-++#ifdef DISABLE_INTRA
-++    return;
-++#endif
-++
-+     if (s->ps.pps->constrained_intra_pred_flag == 1) {
-+         int size_in_luma_pu_v = PU(size_in_luma_v);
-+         int size_in_luma_pu_h = PU(size_in_luma_h);
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 85a9102..c0c279f 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,11 +3,11 @@ unsigned char rpi_hevc_transform [] = {
-+ 3,
-+ 3,
-+ 232,
-+-128,
-++32,
-+ 0,
-+ 0,
-+ 0,
-+-20,
-++12,
-+ 248,
-+ 0,
-+ 136,
-+@@ -56,9 +56,9 @@ unsigned char rpi_hevc_transform [] = {
-+ 5,
-+ 232,
-+ 0,
-+-0,
-+ 8,
-+ 0,
-++0,
-+ 128,
-+ 69,
-+ 113,
-+@@ -108,8 +108,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 128,
-+ 2,
-+ 0,
-+-248,
-+-62,
-++8,
-++2,
-+ 0,
-+ 128,
-+ 144,
-+@@ -123,13 +123,13 @@ unsigned char rpi_hevc_transform [] = {
-+ 3,
-+ 32,
-+ 8,
-+-16,
-++20,
-+ 0,
-+ 76,
-+ 254,
-+ 48,
-+ 192,
-+-9,
-++4,
-+ 4,
-+ 32,
-+ 8,
-+@@ -155,14 +155,46 @@ unsigned char rpi_hevc_transform [] = {
-+ 192,
-+ 41,
-+ 3,
-+-68,
-++70,
-++192,
-++80,
-++7,
-++164,
-++255,
-++36,
-++204,
-++96,
-++2,
-++0,
-++248,
-++62,
-++0,
-++3,
-++255,
-++55,
-++208,
-++120,
-++3,
-++224,
-++3,
-++190,
-++11,
-++16,
-++139,
-++246,
-++91,
-++0,
-++103,
-++90,
-++0,
-++70,
-+ 192,
-+ 80,
-+ 7,
-+ 164,
-+ 255,
-+ 36,
-+-220,
-++204,
-+ 96,
-+ 2,
-+ 0,
-+@@ -182,7 +214,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 16,
-+ 139,
-+ 246,
-+-83,
-++91,
-+ 0,
-+ 103,
-+ 90,
-+@@ -209,4 +241,374 @@ unsigned char rpi_hevc_transform [] = {
-+ 96,
-+ 90,
-+ 0,
-++169,
-++3,
-++3,
-++232,
-++32,
-++0,
-++0,
-++0,
-++12,
-++248,
-++0,
-++136,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++248,
-++0,
-++168,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-++3,
-++232,
-++128,
-++0,
-++0,
-++0,
-++7,
-++232,
-++0,
-++2,
-++0,
-++0,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++5,
-++232,
-++0,
-++8,
-++0,
-++0,
-++57,
-++239,
-++224,
-++247,
-++255,
-++255,
-++72,
-++192,
-++95,
-++207,
-++88,
-++122,
-++88,
-++124,
-++137,
-++64,
-++26,
-++64,
-++161,
-++64,
-++152,
-++64,
-++128,
-++144,
-++31,
-++0,
-++72,
-++232,
-++32,
-++0,
-++0,
-++0,
-++65,
-++232,
-++32,
-++0,
-++0,
-++0,
-++128,
-++144,
-++23,
-++0,
-++145,
-++64,
-++168,
-++64,
-++128,
-++144,
-++19,
-++0,
-++72,
-++232,
-++32,
-++0,
-++0,
-++0,
-++65,
-++232,
-++32,
-++0,
-++0,
-++0,
-++128,
-++144,
-++11,
-++0,
-++74,
-++232,
-++0,
-++8,
-++0,
-++0,
-++242,
-++140,
-++229,
-++192,
-++57,
-++239,
-++32,
-++8,
-++0,
-++0,
-++41,
-++3,
-++12,
-++248,
-++0,
-++128,
-++0,
-++0,
-++192,
-++8,
-++4,
-++0,
-++12,
-++248,
-++0,
-++132,
-++64,
-++0,
-++192,
-++8,
-++4,
-++0,
-++0,
-++96,
-++255,
-++159,
-++131,
-++255,
-++0,
-++232,
-++0,
-++4,
-++0,
-++0,
-++255,
-++159,
-++142,
-++255,
-++4,
-++255,
-++48,
-++204,
-++16,
-++3,
-++224,
-++251,
-++62,
-++0,
-++5,
-++255,
-++51,
-++204,
-++128,
-++3,
-++224,
-++251,
-++16,
-++0,
-++77,
-++254,
-++51,
-++204,
-++9,
-++4,
-++224,
-++251,
-++0,
-++0,
-++128,
-++64,
-++6,
-++232,
-++64,
-++0,
-++0,
-++0,
-++140,
-++248,
-++47,
-++0,
-++0,
-++0,
-++224,
-++99,
-++0,
-++0,
-++4,
-++254,
-++0,
-++144,
-++128,
-++2,
-++0,
-++8,
-++2,
-++0,
-++32,
-++247,
-++240,
-++207,
-++16,
-++3,
-++32,
-++247,
-++176,
-++207,
-++17,
-++3,
-++32,
-++247,
-++112,
-++207,
-++18,
-++3,
-++32,
-++247,
-++48,
-++207,
-++19,
-++3,
-++32,
-++247,
-++240,
-++206,
-++20,
-++3,
-++32,
-++247,
-++176,
-++206,
-++21,
-++3,
-++32,
-++247,
-++112,
-++206,
-++22,
-++3,
-++32,
-++247,
-++48,
-++206,
-++23,
-++3,
-++32,
-++247,
-++240,
-++205,
-++24,
-++3,
-++32,
-++247,
-++176,
-++205,
-++25,
-++3,
-++32,
-++247,
-++112,
-++205,
-++26,
-++3,
-++32,
-++247,
-++48,
-++205,
-++27,
-++3,
-++32,
-++247,
-++240,
-++204,
-++28,
-++3,
-++32,
-++247,
-++176,
-++204,
-++29,
-++3,
-++32,
-++247,
-++112,
-++204,
-++30,
-++3,
-++32,
-++247,
-++48,
-++204,
-++31,
-++3,
-++5,
-++255,
-++51,
-++204,
-++128,
-++3,
-++224,
-++251,
-++16,
-++0,
-++77,
-++254,
-++51,
-++204,
-++9,
-++4,
-++224,
-++251,
-++0,
-++0,
-++0,
-++237,
-++0,
-++4,
-++0,
-++0,
-++140,
-++248,
-++47,
-++0,
-++0,
-++0,
-++224,
-++99,
-++0,
-++0,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index 5e2728d..1e389c7 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -58,13 +58,6 @@
-+ #
-+ #
-+ 
-+-test_add:
-+-  vldh HX(0,0),(r0)
-+-  vadd HX(0,0),HX(0,0),10
-+-  vsth HX(0,0),(r0)
-+-  mov r0,7 # return value
-+-  b lr
-+-
-+ # Columns are transformed first
-+ #
-+ # Store top left half of transMatrix2 in
-+@@ -79,7 +72,7 @@ test_add:
-+ #
-+ 
-+ 
-+-# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num)
-++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+ # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+ # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+ # num: number of 16x16 transforms to be done
-+@@ -87,17 +80,17 @@ test_add:
-+ hevc_trans_16x16:
-+   push r6-r15, lr # TODO cut down number of used registers
-+ 
-+-  mov r3, 2*32*2 # Twice Stride of transMatrix2 in bytes
-+-  vld HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++  mov r3, 16*2 # Stride of transMatrix2 in bytes
-++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+   # Now use r0 to describe which matrix we are working on.
-+   # Allows us to prefetch the next block of coefficients for efficiency.
-+   mov r0,0 # This describes the location where we read our coefficients from
-+-  mov r3,16*2 # Stride of coefficients in bytes
-++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
-+   mov r7,16*16*2 # Total block size
-+   mov r8,64*16 # Value used to swap from current to next VRF location
-+   vldh HX(0++,0)+r0,(r1 += r3) REP 16
-+   mov r4,64 # Constant used for rounding first pass
-+-  mov r5,1<<19 # Constant used for rounding second pass
-++  mov r5,1<<11 # Constant used for rounding second pass
-+ 
-+   # At start of block r0,r1 point to the current block (that has already been loaded)
-+ block_loop:
-+@@ -113,12 +106,12 @@ block_loop:
-+   vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+   #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+   vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-+-  vmov VX(0,0++), HX(0++,32) REP 16          # For simplicity transpose this back to the original position
-++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
-+ 
-+   bl col_trans_16
-+-  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
-+-  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+-  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
-++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Now add on rounding, shift down by 7, and saturate
-++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
-++  vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
-+ 
-+   # Save results - note there has been a transposition during the processing so we save columns
-+   vsth VX(0,32++)+r0, (r1 += r3) REP 16
-+@@ -132,16 +125,136 @@ block_loop:
-+ 
-+ # r1,r2,r3 r7,r8 should be preserved
-+ # HX(0++,0)+r0 is the block to be transformed
-+-# HX(32++,0) is the 16x16 matrix of transform coefficients
-++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
-+ # Use HY(48,0) for intermediate results
-+ # r0 can be used, but should be returned to its original value at the end
-+ col_trans_16:
-+-  add r4,r0,16 # Final value for this loop
-++  add r6,r0,16 # Final value for this loop
-+ col_trans_16_loop:
-+   # First compute partial products for a single column
-+-  vmul32s VY(48,0++), VX(0,0)+r0, VX(32,0++) REP 16
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+-  addcmpblt r0,1,r4,col_trans_16_loop
-++  addcmpblt r0,1,r6,col_trans_16_loop
-+   sub r0,16  # but r0 back to its original value
-+   b lr
-++
-++col_trans_odd_16:
-++  add r6,r0,16 # Final value for this loop
-++col_trans_odd_16_loop:
-++  # First compute partial products for a single column
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-++  # Then sum up the results and place back
-++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-++  addcmpblt r0,1,r6,col_trans_odd_16_loop
-++  sub r0,16  # but r0 back to its original value
-++  b lr
-++
-++
-++test_add:
-++  vldh HX(0,0),(r0)
-++  vadd HX(0,0),HX(0,0),10
-++  vsth HX(0,0),(r0)
-++  mov r0,7 # return value
-++  b lr
-++
-++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
-++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-++# num: number of 16x16 transforms to be done
-++#
-++hevc_trans_32x32:
-++  push r6-r15, lr # TODO cut down number of used registers
-++
-++  # Fetch transform matrices
-++  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-++  add r0, 16*16*2
-++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++
-++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-++  mov r7, 16*16*2 # Total block size
-++  mov r4, 64 # Constant used for rounding first pass
-++  mov r5, 1<<11 # Constant used for rounding second pass
-++  sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-++  # set r8 to 32byte aligned stack pointer
-++  add r8,sp,31
-++  lsr r8,5
-++  lsl r8,5
-++  mov r9,r8  # Backup of the temporary storage
-++  mov r10,r1 # Backup of the coefficient buffer
-++block_loop32:
-++
-++  # COLUMN TRANSFORM
-++  # Transform the first 16 columns
-++  mov r1,r10  # Input Coefficient buffer
-++  mov r8,r9   # Output temporary storage
-++  bl trans32
-++  # Transform the second 16 columns
-++  add r8,32
-++  add r1,32
-++  bl trans32
-++
-++  # ROW TRANSFORM
-++  mov r1,r9  # Input temporary storage
-++  mov r8,r10   # Output Coefficient buffer
-++  bl trans32
-++  # Transform the second 16 columns
-++  add r8,32
-++  add r1,32
-++  bl trans32
-++
-++  add r10, 32*32*2 # move onto next block of coefficients
-++  addcmpbgt r2,-1,0,block_loop32
-++
-++  add sp,sp,32*32*2+32 # Restore stack
-++
-++  pop r6-r15, pc
-++
-++trans32:
-++  # We can no longer afford the VRF space to do prefetching when doing 32x32
-++  # Fetch the even rows
-++  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  # Fetch the odd rows
-++  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-++
-++  # Transform the even rows using even matrix
-++  mov r0, 0 # Even rows
-++  bl col_trans_16
-++
-++  # Now transform the odd rows using odd matrix
-++  mov r0, 64*16 # Odd rows
-++  bl col_trans_odd_16
-++
-++  # Now apply butterfly to compute the first 16 results
-++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-++  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  # 16bit results now in HX(48,32)
-++  mov r0,r8
-++  mov r6,32*2
-++  vsth VX(48,32++),(r0+=r6) REP 16
-++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
-++
-++  # Now apply butterfly to compute the second 16 results (in reverse order)
-++  vsub HY(63,0),HY(0,0),HY(16,0)
-++  vsub HY(62,0),HY(0,0),HY(17,0)
-++  vsub HY(61,0),HY(0,0),HY(18,0)
-++  vsub HY(60,0),HY(0,0),HY(19,0)
-++  vsub HY(59,0),HY(0,0),HY(20,0)
-++  vsub HY(58,0),HY(0,0),HY(21,0)
-++  vsub HY(57,0),HY(0,0),HY(22,0)
-++  vsub HY(56,0),HY(0,0),HY(23,0)
-++  vsub HY(55,0),HY(0,0),HY(24,0)
-++  vsub HY(54,0),HY(0,0),HY(25,0)
-++  vsub HY(53,0),HY(0,0),HY(26,0)
-++  vsub HY(52,0),HY(0,0),HY(27,0)
-++  vsub HY(51,0),HY(0,0),HY(28,0)
-++  vsub HY(50,0),HY(0,0),HY(29,0)
-++  vsub HY(49,0),HY(0,0),HY(30,0)
-++  vsub HY(48,0),HY(0,0),HY(31,0)
-++  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  add r0,r8,16*32*2 # Move to 16th row
-++  vsth VX(48,32++),(r0+=r6) REP 16
-++  b lr
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index b1f50ee..d720546 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -3,6 +3,7 @@
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+ #define RPI_TIME_TOTAL_QPU
-++#define RPI_TIME_TOTAL_VPU
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -48,10 +49,47 @@ typedef int int32_t;
-+ #define QPU_CODE_SIZE 2048
-+ #define VPU_CODE_SIZE 2048
-+ 
-++const short rpi_transMatrix2even[32][16] = { // Even rows first
-++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
-++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
-++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
-++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
-++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
-++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
-++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
-++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
-++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
-++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
-++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
-++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
-++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
-++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
-++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
-++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
-++// Odd rows
-++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
-++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
-++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
-++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
-++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
-++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
-++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
-++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
-++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
-++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
-++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
-++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
-++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
-++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
-++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
-++};
-++
-+ struct GPU
-+ {
-+   unsigned int qpu_code[QPU_CODE_SIZE];
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-++  short transMatrix2even[16*16];
-+   int open_count; // Number of allocated video buffers
-+   unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+@@ -123,6 +161,8 @@ static int gpu_init(volatile struct GPU **gpu) {
-+     assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-++  // And the transform coefficients
-++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
-+ 
-+   return 0;
-+ }
-+@@ -274,11 +314,43 @@ unsigned int vpu_get_fn(void) {
-+   return gpu->vc + offsetof(struct GPU,vpu_code);
-+ }
-+ 
-++unsigned int vpu_get_constants(void) {
-++  if (gpu==NULL) {
-++    gpu_lock();
-++    gpu_unlock();
-++  }
-++  return gpu->vc + offsetof(struct GPU,transMatrix2even);
-++}
-++
-+ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+ {
-+   unsigned r;
-++#ifdef RPI_TIME_TOTAL_VPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++  static long long countr2=0;
-++#endif
-+   gpu_lock();
-++#ifdef RPI_TIME_TOTAL_VPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-+   r = execute_code(gpu->mb, code, r0, r1, r2, r3, r4, r5);
-++#ifdef RPI_TIME_TOTAL_VPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  countr2 += r2;
-++  if ((count&0x7f)==0)
-++    printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+   gpu_unlock();
-+   return r;
-+ }
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 4e3c35c..814fc3c 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -34,6 +34,7 @@ extern unsigned int qpu_get_fn(int num);
-+ 
-+ // VPU specific functions
-+ extern unsigned int vpu_get_fn(void);
-++extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ 
-+ // Simple test of shader code
-+-- 
-+2.5.0
-+
-+
-+From 1f1b223bd911a88726aa2c2f56334b15b421d7fa Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 09:41:23 +0100
-+Subject: [PATCH 05/68] Fixed deblocking
-+
-+---
-+ libavcodec/hevc.c | 20 +++++++++++++++++---
-+ 1 file changed, 17 insertions(+), 3 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 4aacb60..94fdda6 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2387,8 +2387,9 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    //gpu_cache_flush(&s->coeffs_buf[i]);
-++    gpu_cache_flush(&s->coeffs_buf[i]);
-+     vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-++    gpu_cache_flush(&s->coeffs_buf[i]);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2427,6 +2428,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-++    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-+     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+ #endif
-+ 
-+@@ -2460,9 +2462,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (1 || x_ctb + ctb_size >= s->ps.sps->width) { // TODO watch out for deblocking!
-++        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-++            int x;
-++            // Transform all blocks
-+             rpi_execute_transform(s);
-++            // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-++            // Perform deblocking for CTBs in this row
-++            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
-++                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
-++            }
-++            start_ctb_x = 0;
-+         }
-+ #endif
-+         if (more_data < 0) {
-+@@ -2473,6 +2483,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         ctb_addr_ts++;
-+         ff_hevc_save_states(s, ctb_addr_ts);
-++#ifdef RPI
-++        if (s->enable_rpi)
-++            continue;
-++#endif
-+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-+     }
-+ 
-+@@ -3217,7 +3231,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+     for(i = 0; i < 4; i++) {
-+-        gpu_malloc_uncached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-++        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+         s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-+         if (!s->coeffs_buf_arm[i])
-+             goto fail;
-+-- 
-+2.5.0
-+
-+
-+From a32f8972fedc38dcf887f8f2899e8843efd6324a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 11:32:30 +0100
-+Subject: [PATCH 06/68] Added 32x32 transform
-+
-+---
-+ libavcodec/hevc.c               |   8 +-
-+ libavcodec/hevc_cabac.c         |   4 +-
-+ libavcodec/rpi_hevc_transform.h | 200 +++++++++++++++++-----------------------
-+ libavcodec/rpi_hevc_transform.s | 102 ++++++++++----------
-+ libavcodec/rpi_qpu.c            |   4 +-
-+ 5 files changed, 148 insertions(+), 170 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 94fdda6..fbbd30f 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2387,9 +2387,11 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf[i]);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[i].vc, s->num_coeffs[i] >> 8, 0, 0, 0);
-+-    gpu_cache_flush(&s->coeffs_buf[i]);
-++    gpu_cache_flush(&s->coeffs_buf[2]);
-++    gpu_cache_flush(&s->coeffs_buf[3]);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
-++    gpu_cache_flush(&s->coeffs_buf[2]);
-++    gpu_cache_flush(&s->coeffs_buf[3]);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index a7561bd..3e6dabf 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1031,7 +1031,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     int vshift = s->ps.sps->vshift[c_idx];
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+-    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size==4;
-++#ifdef RPI
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-++#endif
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+     uint8_t significant_coeff_group_flag[8][8] = {{0}};
-+     int explicit_rdpcm_flag = 0;
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index c0c279f..6d772d7 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -1,6 +1,10 @@
-+ unsigned char rpi_hevc_transform [] = {
-+ 169,
-+ 3,
-++62,
-++64,
-++79,
-++64,
-+ 3,
-+ 232,
-+ 32,
-+@@ -17,6 +21,22 @@ unsigned char rpi_hevc_transform [] = {
-+ 248,
-+ 0,
-+ 0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++248,
-++0,
-++168,
-++0,
-++0,
-++192,
-++248,
-++0,
-++0,
-+ 0,
-+ 96,
-+ 3,
-+@@ -79,7 +99,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 70,
-+ 128,
-+ 144,
-+-39,
-++40,
-+ 0,
-+ 4,
-+ 255,
-+@@ -113,7 +133,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 128,
-+ 144,
-+-22,
-++23,
-+ 0,
-+ 4,
-+ 255,
-+@@ -153,6 +173,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 140,
-+ 211,
-+ 192,
-++34,
-++31,
-+ 41,
-+ 3,
-+ 70,
-+@@ -195,7 +217,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 255,
-+ 36,
-+ 204,
-+-96,
-++224,
-+ 2,
-+ 0,
-+ 248,
-+@@ -219,62 +241,10 @@ unsigned char rpi_hevc_transform [] = {
-+ 103,
-+ 90,
-+ 0,
-+-8,
-+-240,
-+-0,
-+-128,
-+-128,
-+-3,
-+-0,
-+-247,
-+-32,
-+-128,
-+-10,
-+-4,
-+-136,
-+-240,
-+-32,
-+-0,
-+-128,
-+-3,
-+-112,
-+-96,
-+-90,
-+-0,
-+-169,
-+-3,
-+-3,
-+-232,
-+-32,
-+-0,
-+-0,
-+-0,
-+-12,
-+-248,
-+-0,
-+-136,
-+-0,
-+-0,
-+-192,
-+-248,
-+-0,
-+-0,
-++225,
-++64,
-++242,
-+ 64,
-+-232,
-+-0,
-+-2,
-+-0,
-+-0,
-+-12,
-+-248,
-+-0,
-+-168,
-+-0,
-+-0,
-+-192,
-+-248,
-+-0,
-+-0,
-+ 3,
-+ 232,
-+ 128,
-+@@ -287,18 +257,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 2,
-+ 0,
-+ 0,
-+-4,
-+-232,
-+-64,
-+-0,
-+-0,
-+-0,
-+-5,
-+-232,
-+-0,
-+-8,
-+-0,
-+-0,
-+ 57,
-+ 239,
-+ 224,
-+@@ -317,18 +275,26 @@ unsigned char rpi_hevc_transform [] = {
-+ 64,
-+ 26,
-+ 64,
-++4,
-++232,
-++64,
-++0,
-++0,
-++0,
-++149,
-++96,
-+ 161,
-+ 64,
-+ 152,
-+ 64,
-+ 128,
-+ 144,
-+-31,
-++35,
-+ 0,
-+ 72,
-+ 232,
-+-32,
-+ 0,
-++4,
-+ 0,
-+ 0,
-+ 65,
-+@@ -339,8 +305,16 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 128,
-+ 144,
-+-23,
-++27,
-++0,
-++4,
-++232,
-++0,
-++8,
-+ 0,
-++0,
-++69,
-++96,
-+ 145,
-+ 64,
-+ 168,
-+@@ -351,8 +325,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 72,
-+ 232,
-+-32,
-+ 0,
-++4,
-+ 0,
-+ 0,
-+ 65,
-+@@ -373,7 +347,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 242,
-+ 140,
-+-229,
-++221,
-+ 192,
-+ 57,
-+ 239,
-+@@ -383,6 +357,8 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 41,
-+ 3,
-++239,
-++3,
-+ 12,
-+ 248,
-+ 0,
-+@@ -390,7 +366,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 0,
-+ 192,
-+-8,
-++248,
-+ 4,
-+ 0,
-+ 12,
-+@@ -400,14 +376,14 @@ unsigned char rpi_hevc_transform [] = {
-+ 64,
-+ 0,
-+ 192,
-+-8,
-++248,
-+ 4,
-+ 0,
-+ 0,
-+ 96,
-+ 255,
-+ 159,
-+-131,
-++154,
-+ 255,
-+ 0,
-+ 232,
-+@@ -417,7 +393,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 255,
-+ 159,
-+-142,
-++165,
-+ 255,
-+ 4,
-+ 255,
-+@@ -429,7 +405,7 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 62,
-+ 0,
-+-5,
-++4,
-+ 255,
-+ 51,
-+ 204,
-+@@ -439,15 +415,15 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 16,
-+ 0,
-+-77,
-++76,
-+ 254,
-+ 51,
-+ 204,
-+-9,
-+-4,
-++128,
-++3,
-+ 224,
-+ 251,
-+-0,
-++20,
-+ 0,
-+ 128,
-+ 64,
-+@@ -467,16 +443,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 99,
-+ 0,
-+ 0,
-+-4,
-+-254,
-+-0,
-+-144,
-+-128,
-+-2,
-+-0,
-+-8,
-+-2,
-+-0,
-+ 32,
-+ 247,
-+ 240,
-+@@ -488,92 +454,92 @@ unsigned char rpi_hevc_transform [] = {
-+ 176,
-+ 207,
-+ 17,
-+-3,
-++19,
-+ 32,
-+ 247,
-+ 112,
-+ 207,
-+ 18,
-+-3,
-++35,
-+ 32,
-+ 247,
-+ 48,
-+ 207,
-+ 19,
-+-3,
-++51,
-+ 32,
-+ 247,
-+ 240,
-+ 206,
-+ 20,
-+-3,
-++67,
-+ 32,
-+ 247,
-+ 176,
-+ 206,
-+ 21,
-+-3,
-++83,
-+ 32,
-+ 247,
-+ 112,
-+ 206,
-+ 22,
-+-3,
-++99,
-+ 32,
-+ 247,
-+ 48,
-+ 206,
-+ 23,
-+-3,
-++115,
-+ 32,
-+ 247,
-+ 240,
-+ 205,
-+ 24,
-+-3,
-++131,
-+ 32,
-+ 247,
-+ 176,
-+ 205,
-+ 25,
-+-3,
-++147,
-+ 32,
-+ 247,
-+ 112,
-+ 205,
-+ 26,
-+-3,
-++163,
-+ 32,
-+ 247,
-+ 48,
-+ 205,
-+ 27,
-+-3,
-++179,
-+ 32,
-+ 247,
-+ 240,
-+ 204,
-+ 28,
-+-3,
-++195,
-+ 32,
-+ 247,
-+ 176,
-+ 204,
-+ 29,
-+-3,
-++211,
-+ 32,
-+ 247,
-+ 112,
-+ 204,
-+ 30,
-+-3,
-++227,
-+ 32,
-+ 247,
-+ 48,
-+ 204,
-+ 31,
-+-3,
-+-5,
-++243,
-++4,
-+ 255,
-+ 51,
-+ 204,
-+@@ -583,20 +549,20 @@ unsigned char rpi_hevc_transform [] = {
-+ 251,
-+ 16,
-+ 0,
-+-77,
-++76,
-+ 254,
-+ 51,
-+ 204,
-+-9,
-+-4,
-++128,
-++3,
-+ 224,
-+ 251,
-+-0,
-++20,
-+ 0,
-+ 0,
-+ 237,
-++32,
-+ 0,
-+-4,
-+ 0,
-+ 0,
-+ 140,
-+@@ -609,6 +575,6 @@ unsigned char rpi_hevc_transform [] = {
-+ 99,
-+ 0,
-+ 0,
-+-90,
-+-0,
-++111,
-++3,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index 1e389c7..afdb32a 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -76,12 +76,19 @@
-+ # transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+ # coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+ # num: number of 16x16 transforms to be done
-++# coeffs32
-++# num32: number of 32x32 transforms
-+ #
-+ hevc_trans_16x16:
-+   push r6-r15, lr # TODO cut down number of used registers
-+-
-++  mov r14,r3 # coeffs32
-++  mov r15,r4 # num32
-+   mov r3, 16*2 # Stride of transMatrix2 in bytes
-+   vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-++
-++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++
-+   # Now use r0 to describe which matrix we are working on.
-+   # Allows us to prefetch the next block of coefficients for efficiency.
-+   mov r0,0 # This describes the location where we read our coefficients from
-+@@ -121,6 +128,10 @@ block_loop:
-+   add r1,r7
-+ 
-+   addcmpbgt r2,-1,0,block_loop
-++
-++  # Now go and do any 32x32 transforms
-++  b hevc_trans_32x32
-++
-+   pop r6-r15, pc
-+ 
-+ # r1,r2,r3 r7,r8 should be preserved
-+@@ -136,26 +147,18 @@ col_trans_16_loop:
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+   addcmpblt r0,1,r6,col_trans_16_loop
-+-  sub r0,16  # but r0 back to its original value
-++  sub r0,16  # put r0 back to its original value
-+   b lr
-+ 
-+ col_trans_odd_16:
-+   add r6,r0,16 # Final value for this loop
-+ col_trans_odd_16_loop:
-+   # First compute partial products for a single column
-+-  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
-+   # Then sum up the results and place back
-+   vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+   addcmpblt r0,1,r6,col_trans_odd_16_loop
-+-  sub r0,16  # but r0 back to its original value
-+-  b lr
-+-
-+-
-+-test_add:
-+-  vldh HX(0,0),(r0)
-+-  vadd HX(0,0),HX(0,0),10
-+-  vsth HX(0,0),(r0)
-+-  mov r0,7 # return value
-++  sub r0,16  # put r0 back to its original value
-+   b lr
-+ 
-+ # hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-+@@ -164,18 +167,17 @@ test_add:
-+ # num: number of 16x16 transforms to be done
-+ #
-+ hevc_trans_32x32:
-+-  push r6-r15, lr # TODO cut down number of used registers
-++  mov r1,r14 # coeffs
-++  mov r2,r15 # num
-+ 
-+-  # Fetch transform matrices
-+-  mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+-  vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+-  add r0, 16*16*2
-+-  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-++  # Fetch odd transform matrix
-++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-++  #add r0, 16*16*2
-++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+ 
-+   mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-+   mov r7, 16*16*2 # Total block size
-+-  mov r4, 64 # Constant used for rounding first pass
-+-  mov r5, 1<<11 # Constant used for rounding second pass
-+   sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
-+   # set r8 to 32byte aligned stack pointer
-+   add r8,sp,31
-+@@ -186,21 +188,27 @@ hevc_trans_32x32:
-+ block_loop32:
-+ 
-+   # COLUMN TRANSFORM
-++  mov r4, 64 # Constant used for rounding first pass
-++  mov r5, 9 # left shift used for rounding first pass
-++
-+   # Transform the first 16 columns
-+   mov r1,r10  # Input Coefficient buffer
-+   mov r8,r9   # Output temporary storage
-+   bl trans32
-+   # Transform the second 16 columns
-+-  add r8,32
-++  add r8,32*16*2
-+   add r1,32
-+   bl trans32
-+ 
-+   # ROW TRANSFORM
-++  mov r4, 1<<11 # Constant used for rounding second pass
-++  mov r5, 4 # left shift used for rounding second pass
-++
-+   mov r1,r9  # Input temporary storage
-+   mov r8,r10   # Output Coefficient buffer
-+   bl trans32
-+   # Transform the second 16 columns
-+-  add r8,32
-++  add r8,32*16*2
-+   add r1,32
-+   bl trans32
-+ 
-+@@ -212,11 +220,12 @@ block_loop32:
-+   pop r6-r15, pc
-+ 
-+ trans32:
-++  push lr
-+   # We can no longer afford the VRF space to do prefetching when doing 32x32
-+   # Fetch the even rows
-+-  vldh HX(0++,0)+r0,(r1 += r3) REP 16
-++  vldh HX(0++,0),(r1 += r3) REP 16
-+   # Fetch the odd rows
-+-  vldh HX(16++,0)+r0,64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+ 
-+   # Transform the even rows using even matrix
-+   mov r0, 0 # Even rows
-+@@ -228,33 +237,32 @@ trans32:
-+ 
-+   # Now apply butterfly to compute the first 16 results
-+   vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-+-  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+-  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-+   # 16bit results now in HX(48,32)
-+   mov r0,r8
-+   mov r6,32*2
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+-  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # Store transposed
-+ 
-+   # Now apply butterfly to compute the second 16 results (in reverse order)
-+-  vsub HY(63,0),HY(0,0),HY(16,0)
-+-  vsub HY(62,0),HY(0,0),HY(17,0)
-+-  vsub HY(61,0),HY(0,0),HY(18,0)
-+-  vsub HY(60,0),HY(0,0),HY(19,0)
-+-  vsub HY(59,0),HY(0,0),HY(20,0)
-+-  vsub HY(58,0),HY(0,0),HY(21,0)
-+-  vsub HY(57,0),HY(0,0),HY(22,0)
-+-  vsub HY(56,0),HY(0,0),HY(23,0)
-+-  vsub HY(55,0),HY(0,0),HY(24,0)
-+-  vsub HY(54,0),HY(0,0),HY(25,0)
-+-  vsub HY(53,0),HY(0,0),HY(26,0)
-+-  vsub HY(52,0),HY(0,0),HY(27,0)
-+-  vsub HY(51,0),HY(0,0),HY(28,0)
-+-  vsub HY(50,0),HY(0,0),HY(29,0)
-+-  vsub HY(49,0),HY(0,0),HY(30,0)
-+-  vsub HY(48,0),HY(0,0),HY(31,0)
-+-  vadd HY(48++,0),HY(48++,0),r4 REP 32   # add on rounding,
-+-  vasl HY(48++,0),HY(48++,0),9 REP 32    # shift down by 7, and saturate
-+-  add r0,r8,16*32*2 # Move to 16th row
-++  vsub HY(63,0),HY(0 ,0),HY(16,0)
-++  vsub HY(62,0),HY(1 ,0),HY(17,0)
-++  vsub HY(61,0),HY(2 ,0),HY(18,0)
-++  vsub HY(60,0),HY(3 ,0),HY(19,0)
-++  vsub HY(59,0),HY(4 ,0),HY(20,0)
-++  vsub HY(58,0),HY(5 ,0),HY(21,0)
-++  vsub HY(57,0),HY(6 ,0),HY(22,0)
-++  vsub HY(56,0),HY(7 ,0),HY(23,0)
-++  vsub HY(55,0),HY(8 ,0),HY(24,0)
-++  vsub HY(54,0),HY(9 ,0),HY(25,0)
-++  vsub HY(53,0),HY(10,0),HY(26,0)
-++  vsub HY(52,0),HY(11,0),HY(27,0)
-++  vsub HY(51,0),HY(12,0),HY(28,0)
-++  vsub HY(50,0),HY(13,0),HY(29,0)
-++  vsub HY(49,0),HY(14,0),HY(30,0)
-++  vsub HY(48,0),HY(15,0),HY(31,0)
-++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
-++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # shift down by 7, and saturate
-++  add r0,r8,32
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+-  b lr
-++  pop pc
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index d720546..12ad5fb 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -89,7 +89,7 @@ struct GPU
-+ {
-+   unsigned int qpu_code[QPU_CODE_SIZE];
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-+-  short transMatrix2even[16*16];
-++  short transMatrix2even[16*16*2];
-+   int open_count; // Number of allocated video buffers
-+   unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+@@ -162,7 +162,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-+   // And the transform coefficients
-+-  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, 16*16*sizeof(short));
-++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+ 
-+   return 0;
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 223fee0c814602a2aa5611c21fe052e6b6e063c1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 5 May 2015 16:57:03 +0100
-+Subject: [PATCH 07/68] Clear coefficients in advance
-+
-+---
-+ libavcodec/hevc.c               | 129 ++++++++++++++++++++++++++++------------
-+ libavcodec/hevc.h               |   6 +-
-+ libavcodec/hevc_cabac.c         |   7 ++-
-+ libavcodec/rpi_hevc_transform.h |  50 ++++++++++++++++
-+ libavcodec/rpi_hevc_transform.s |  16 +++++
-+ 5 files changed, 168 insertions(+), 40 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index fbbd30f..12e66a6 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,6 +41,8 @@
-+ 
-+ #ifdef RPI
-+ #include "rpi_qpu.h"
-++// For some unknown reason, the code seems to crash if I do a late malloc
-++#define EARLY_MALLOC
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -59,6 +61,20 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ /* free everything allocated  by pic_arrays_init() */
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-++#ifdef RPI
-++#ifdef EARLY_MALLOC
-++#else
-++    printf("pic_arrays_free\n");
-++    if (s->coeffs_buf_arm[0]) {
-++      gpu_free(&s->coeffs_buf_default);
-++      s->coeffs_buf_arm[0] = 0;
-++    }
-++    if (s->coeffs_buf_arm[2]) {
-++      gpu_free(&s->coeffs_buf_accelerated);
-++      s->coeffs_buf_arm[2] = 0;
-++    }
-++#endif
-++#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -95,6 +111,28 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int ctb_count        = sps->ctb_width * sps->ctb_height;
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-++#ifdef RPI
-++#ifdef EARLY_MALLOC
-++#else
-++    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-++    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-++    printf("pic_arrays_init\n");
-++    printf("Allocated %d\n",coefs_per_row);
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++    if (!s->coeffs_buf_arm[0])
-++        goto fail;
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++    if (!s->coeffs_buf_arm[2])
-++        goto fail;
-++    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++    printf("Done\n");
-++#endif
-++#endif
-++
-+     s->bs_width  = (width  >> 2) + 1;
-+     s->bs_height = (height >> 2) + 1;
-+ 
-+@@ -2387,11 +2425,10 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf[2]);
-+-    gpu_cache_flush(&s->coeffs_buf[3]);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf[2].vc, s->num_coeffs[2] >> 8, s->coeffs_buf[3].vc, s->num_coeffs[3] >> 10, 0);
-+-    gpu_cache_flush(&s->coeffs_buf[2]);
-+-    gpu_cache_flush(&s->coeffs_buf[3]);
-++
-++    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-++    //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2413,7 +2450,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+       } else {
-++          int trafo_size = 1 << cmd->size;
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-++          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+@@ -3158,10 +3197,18 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_mv_cmds);
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+-    for(i = 0; i < 4; i++) {
-+-        gpu_free(&s->coeffs_buf[i]);
-++
-++#ifdef EARLY_MALLOC
-++    if (s->coeffs_buf_arm[0]) {
-++      gpu_free(&s->coeffs_buf_default);
-++      s->coeffs_buf_arm[0] = 0;
-++    }
-++    if (s->coeffs_buf_arm[2]) {
-++      gpu_free(&s->coeffs_buf_accelerated);
-++      s->coeffs_buf_arm[2] = 0;
-+     }
-+ #endif
-++#endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+         av_freep(&s->sao_pixel_buffer_h[i]);
-+@@ -3209,6 +3256,16 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     return 0;
-+ }
-+ 
-++#ifdef RPI
-++static av_cold void memclear16(int16_t *p, int n)
-++{
-++  vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-++  //int i;
-++  //for(i=0;i<n;i++)
-++  //  p[i] = 0;
-++}
-++#endif
-++
-+ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+     HEVCContext *s = avctx->priv_data;
-+@@ -3232,37 +3289,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+-    for(i = 0; i < 4; i++) {
-+-        gpu_malloc_cached(sizeof(int16_t)*RPI_MAX_XFM_CMDS*16, &s->coeffs_buf[i]); // TODO slim this down and share across sizes
-+-        s->coeffs_buf_arm[i] = (int16_t*) s->coeffs_buf[i].arm;
-+-        if (!s->coeffs_buf_arm[i])
-+-            goto fail;
-+-    }
-+-    s->enable_rpi = 0;
-+ 
-+-    // A little test program
-+-    /*{
-+-      GPU_MEM_PTR_T p;
-+-      int err = gpu_malloc_cached(16, &p);
-+-      short *q = (short *)p.arm;
-+-      int i;
-+-      int r;
-+-      printf("Allocated memory %d ARM 0x%x, VC 0x%x, Code 0x%x\n",err,(int)p.arm,p.vc,(int)vpu_get_fn());
-+-      printf("Allocated memory %d ARM 0x%x, VC 0x%x\n",err,(int)p.arm,p.vc);
-+-      printf("Preparing data %p\n",q);
-+-      for(i=0;i<16;i++)
-+-        q[i] = i;
-+-      printf("Flush cache\n");
-+-      gpu_cache_flush(&p);
-+-      printf("Executing code\n");
-+-      r = vpu_execute_code( vpu_get_fn(), p.vc, 0, 0, 0, 0, 0);
-+-      printf("Return value %d (",r);
-+-      for(i=0;i<16;i++)
-+-        printf("%d ",q[i]);
-+-      printf(")\n");
-+-      gpu_free(&p);
-+-      goto fail; // Early out
-+-    }*/
-++    s->coeffs_buf_arm[0] = 0;
-++    s->coeffs_buf_arm[2] = 0;
-++
-++#ifdef EARLY_MALLOC
-++    int coeffs_in_ctb = 64*64;
-++    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-++    printf("Allocated %d\n",coefs_per_row);
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++    if (!s->coeffs_buf_arm[0])
-++        goto fail;
-++    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++    if (!s->coeffs_buf_arm[2])
-++        goto fail;
-++    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++    printf("Done\n");
-++    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-++    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-++    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-++    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-++#endif
-++
-++    s->enable_rpi = 0;
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index f201817..ca7c2aa 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -887,8 +887,12 @@ typedef struct HEVCContext {
-+     HEVCMvCmd *unif_mv_cmds;
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+-    GPU_MEM_PTR_T coeffs_buf[4];
-++    int buf_width;
-++    GPU_MEM_PTR_T coeffs_buf_default;
-++    GPU_MEM_PTR_T coeffs_buf_accelerated;
-+     int16_t *coeffs_buf_arm[4];
-++    unsigned int coeffs_buf_vc[4];
-++
-+     int num_coeffs[4];
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 3e6dabf..a295d3e 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1058,9 +1058,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             s->num_coeffs[0] += n;
-+         }
-+     }
-++    // We now do the memset after transform_add while we know the data is cached.
-++    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++#else
-++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ #endif
-+ 
-+-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++
-+ 
-+     // Derive QP for dequant
-+     if (!lc->cu.cu_transquant_bypass_flag) {
-+@@ -1547,7 +1551,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+         HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-+-        //memcpy(coeffs2, coeffs, sizeof(int16_t) * trafo_size * trafo_size); // TODO
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+         cmd->buf = coeffs;
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 6d772d7..4f13622 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -1,4 +1,10 @@
-+ unsigned char rpi_hevc_transform [] = {
-++21,
-++106,
-++0,
-++144,
-++35,
-++1,
-+ 169,
-+ 3,
-+ 62,
-+@@ -577,4 +583,48 @@ unsigned char rpi_hevc_transform [] = {
-+ 0,
-+ 111,
-+ 3,
-++4,
-++254,
-++0,
-++128,
-++0,
-++4,
-++0,
-++248,
-++0,
-++0,
-++2,
-++232,
-++32,
-++0,
-++0,
-++0,
-++140,
-++248,
-++32,
-++0,
-++0,
-++0,
-++224,
-++35,
-++0,
-++0,
-++64,
-++232,
-++0,
-++2,
-++0,
-++0,
-++193,
-++232,
-++0,
-++1,
-++0,
-++0,
-++1,
-++106,
-++116,
-++30,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index afdb32a..fd159bc 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -78,8 +78,11 @@
-+ # num: number of 16x16 transforms to be done
-+ # coeffs32
-+ # num32: number of 32x32 transforms
-++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
-+ #
-+ hevc_trans_16x16:
-++  cmp r5,1
-++  beq memclear16
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -266,3 +269,16 @@ trans32:
-+   add r0,r8,32
-+   vsth VX(48,32++),(r0+=r6) REP 16
-+   pop pc
-++
-++memclear16:
-++  # r0 is address
-++  # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
-++  vmov HX(0++,0),0 REP 16
-++  mov r2,32
-++loop:
-++  vsth HX(0++,0),(r0+=r2) REP 16
-++  add r0,16*16*2
-++  sub r1,16*16
-++  cmp r1,0
-++  bgt loop
-++  b lr
-+-- 
-+2.5.0
-+
-+
-+From dffd0d9fc1ada2b61c61c73cba53538e564ced02 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 09:56:43 +0100
-+Subject: [PATCH 08/68] Prepared inter offload
-+
-+---
-+ libavcodec/hevc.c       | 116 +++++++++++++++++++++++++++++++++++++++++++-----
-+ libavcodec/hevc.h       |  29 +++++++++++-
-+ libavcodec/hevc_cabac.c |   5 ++-
-+ 3 files changed, 137 insertions(+), 13 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 12e66a6..7453b63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -43,6 +43,8 @@
-+ #include "rpi_qpu.h"
-+ // For some unknown reason, the code seems to crash if I do a late malloc
-+ #define EARLY_MALLOC
-++// Move Inter prediction into separate pass
-++//#define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -1427,6 +1429,95 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-+  * @param luma_offset additive offset applied to the luma prediction value
-+  */
-+ 
-++#ifdef RPI_INTER
-++#define RPI_REDIRECT(fn) rpi_ ## fn
-++static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
-++                        int block_w, int block_h, int luma_weight, int luma_offset)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_LUMA_UNI;
-++    cmd->dst = dst;
-++    cmd->dststride = dststride;
-++    cmd->src = ref->data[0];
-++    cmd->srcstride = ref->linesize[0];
-++    cmd->mv = *mv;
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = luma_weight;
-++    cmd->offset = luma_offset;
-++}
-++
-++static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_LUMA_BI;
-++    cmd->dst = dst;
-++    cmd->dststride = dststride;
-++    cmd->src = ref->data[0];
-++    cmd->srcstride = ref->linesize[0];
-++    cmd->mv = *mv;
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = luma_weight;
-++    cmd->offset = luma_offset;
-++    cmd->src1 = ref1->data[];
-++    cmd->srcstride1 = ref1->linesize[0];
-++    cmd->mv1 = *mv1;
-++    cmd->ref_idx[0] = current_mv->ref_idx[0];
-++    cmd->ref_idx[1] = current_mv->ref_idx[1];
-++}
-++
-++static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-++                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_CHROMA_UNI;
-++    cmd->dst = dst0;
-++    cmd->dststride = dststride;
-++    cmd->src = src0;
-++    cmd->srcstride = srcstride;
-++    cmd->mv = current_mv->mv[reflist];
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = chroma_weight;
-++    cmd->offset = chroma_offset;
-++}
-++
-++static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-++                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-++{
-++    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-++    cmd->dst = dst0;
-++    cmd->dststride = dststride;
-++    cmd->src = ref0->data[cidx+1];
-++    cmd->srcstride = ref0->linesize[cidx+1];
-++    cmd->mv = current_mv->mv[reflist];
-++    cmd->x_off = x_off;
-++    cmd->y_off = y_off;
-++    cmd->block_w = block_w;
-++    cmd->block_h = block_h;
-++    cmd->weight = chroma_weight;
-++    cmd->offset = chroma_offset;
-++    cmd->src = ref1->data[cidx+1];
-++    cmd->srcstride1 = ref1->linesize[cidx+1];
-++    cmd->ref_idx[0] = current_mv->ref_idx[0];
-++    cmd->ref_idx[1] = current_mv->ref_idx[1];
-++}
-++#else
-++#define RPI_REDIRECT(fn) fn
-++#endif
-++
-+ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+@@ -1492,7 +1583,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+  * @param mv1 motion vector1 (relative to block position) to get pixel data from
-+  * @param current_mv current motion vector structure
-+  */
-+- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+@@ -1874,16 +1965,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
-++        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
-+-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
-+         }
-+@@ -1893,17 +1984,17 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
-++        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-+                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
-+ 
-+-            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-++            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
-+         }
-+@@ -1913,15 +2004,15 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
-++        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+-            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-++            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
-+ 
-+-            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-++            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
-+         }
-+     }
-+@@ -2452,7 +2543,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+       } else {
-+           int trafo_size = 1 << cmd->size;
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-++#ifdef RPI_PRECLEAR
-+           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-++#endif
-+       }
-+   }
-+   s->num_pred_cmds = 0;
-+@@ -3309,6 +3402,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+     s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+     printf("Done\n");
-++#ifdef RPI_PRECLEAR
-+     //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+     memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+     //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+@@ -3317,6 +3411,8 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+ #endif
-+ 
-++#endif
-++
-+     s->enable_rpi = 0;
-+ 
-+ #endif
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index ca7c2aa..8ef6f51 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -829,14 +829,39 @@ typedef struct HEVCLocalContext {
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+ #define RPI_MAX_WIDTH 2048
-+ 
-+-// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane
-+-#define RPI_MAX_MV_CMDS   (16*3*(RPI_MAX_WIDTH/4))
-++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-++#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+ #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+ 
-++#define RPI_CMD_LUMA_UNI 0
-++#define RPI_CMD_CHROMA_UNI 1
-++#define RPI_CMD_LUMA_BI 2
-++#define RPI_CMD_U_BI 3
-++#define RPI_CMD_V_BI 4
-++
-++// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-++// #define RPI_PRECLEAR
-++
-+ // Command for inter prediction
-+ typedef struct HEVCMvCmd {
-++    int cmd;
-++    uint8_t *dst;
-++    ptrdiff_t dststride;
-++    uint8_t *src;
-++    ptrdiff_t srcstride;
-++    Mv mv;
-++    int x_off;
-++    int y_off;
-++    int block_w;
-++    int block_h;
-++    int weight;
-++    int offset;
-++    uint8_t *src1;
-++    ptrdiff_t srcstride1;
-++    Mv mv1;
-++    int8_t ref_idx[2];
-+ } HEVCMvCmd;
-+ 
-+ // Command for transform to process a block of coefficients
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index a295d3e..f28759b 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1059,7 +1059,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         }
-+     }
-+     // We now do the memset after transform_add while we know the data is cached.
-+-    //memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++    #ifdef RPI_PRECLEAR
-++    #else
-++    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-++    #endif
-+ #else
-+     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From fa1aa086848e704e43a90d09ddf35a5e7d99aae2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 11:08:50 +0100
-+Subject: [PATCH 09/68] Inter prediction in separate pass
-+
-+---
-+ libavcodec/hevc.c | 93 +++++++++++++++++++++++++++++++++++++++++++++----------
-+ libavcodec/hevc.h |  2 +-
-+ 2 files changed, 77 insertions(+), 18 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7453b63..83fdb57 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -44,7 +44,7 @@
-+ // For some unknown reason, the code seems to crash if I do a late malloc
-+ #define EARLY_MALLOC
-+ // Move Inter prediction into separate pass
-+-//#define RPI_INTER
-++#define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -1435,7 +1435,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_LUMA_UNI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1454,31 +1454,29 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_LUMA_BI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+-    cmd->src = ref->data[0];
-+-    cmd->srcstride = ref->linesize[0];
-+-    cmd->mv = *mv;
-++    cmd->src = ref0->data[0];
-++    cmd->srcstride = ref0->linesize[0];
-++    cmd->mv = *mv0;
-+     cmd->x_off = x_off;
-+     cmd->y_off = y_off;
-+     cmd->block_w = block_w;
-+     cmd->block_h = block_h;
-+-    cmd->weight = luma_weight;
-+-    cmd->offset = luma_offset;
-+-    cmd->src1 = ref1->data[];
-++    cmd->src1 = ref1->data[0];
-+     cmd->srcstride1 = ref1->linesize[0];
-+     cmd->mv1 = *mv1;
-+     cmd->ref_idx[0] = current_mv->ref_idx[0];
-+     cmd->ref_idx[1] = current_mv->ref_idx[1];
-+ }
-+ 
-+-static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-++static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_CHROMA_UNI;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -1493,27 +1491,27 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+     cmd->offset = chroma_offset;
-+ }
-+ 
-+-static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-++static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-+ {
-+-    HEVCMvCmd *cmd = unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-+     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+     cmd->src = ref0->data[cidx+1];
-+     cmd->srcstride = ref0->linesize[cidx+1];
-+-    cmd->mv = current_mv->mv[reflist];
-++    cmd->mv = current_mv->mv[0];
-++    cmd->mv1 = current_mv->mv[1];
-+     cmd->x_off = x_off;
-+     cmd->y_off = y_off;
-+     cmd->block_w = block_w;
-+     cmd->block_h = block_h;
-+-    cmd->weight = chroma_weight;
-+-    cmd->offset = chroma_offset;
-+-    cmd->src = ref1->data[cidx+1];
-++    cmd->src1 = ref1->data[cidx+1];
-+     cmd->srcstride1 = ref1->linesize[cidx+1];
-+     cmd->ref_idx[0] = current_mv->ref_idx[0];
-+     cmd->ref_idx[1] = current_mv->ref_idx[1];
-+ }
-++
-+ #else
-+ #define RPI_REDIRECT(fn) fn
-+ #endif
-+@@ -2541,7 +2539,9 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+           lc->na.cand_up_right     = (cmd->na >> 0) & 1;
-+           s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
-+       } else {
-++#ifdef RPI_PRECLEAR
-+           int trafo_size = 1 << cmd->size;
-++#endif
-+           s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
-+ #ifdef RPI_PRECLEAR
-+           memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
-+@@ -2550,6 +2550,61 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+   }
-+   s->num_pred_cmds = 0;
-+ }
-++
-++static void rpi_execute_inter_cmds(HEVCContext *s)
-++{
-++    HEVCMvCmd *cmd = s->unif_mv_cmds;
-++    int n,cidx;
-++    AVFrame myref;
-++    AVFrame myref1;
-++    struct MvField mymv;
-++    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
-++        printf("Overflow inter_cmds\n");
-++        exit(-1);
-++    }
-++    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
-++        switch(cmd->cmd) {
-++        case RPI_CMD_LUMA_UNI:
-++            myref.data[0] = cmd->src;
-++            myref.linesize[0] = cmd->srcstride;
-++            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
-++            break;
-++        case RPI_CMD_LUMA_BI:
-++            myref.data[0] = cmd->src;
-++            myref.linesize[0] = cmd->srcstride;
-++            myref1.data[0] = cmd->src1;
-++            myref1.linesize[0] = cmd->srcstride1;
-++            mymv.ref_idx[0] = cmd->ref_idx[0];
-++            mymv.ref_idx[1] = cmd->ref_idx[1];
-++            luma_mc_bi(s, cmd->dst, cmd->dststride,
-++                       &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h,
-++                       &myref1, &cmd->mv1, &mymv);
-++            break;
-++        case RPI_CMD_CHROMA_UNI:
-++            mymv.mv[0] = cmd->mv;
-++            chroma_mc_uni(s, cmd->dst,
-++                          cmd->dststride, cmd->src, cmd->srcstride, 0,
-++                          cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset);
-++            break;
-++        case RPI_CMD_CHROMA_BI:
-++        case RPI_CMD_CHROMA_BI+1:
-++            cidx = cmd->cmd - RPI_CMD_CHROMA_BI;
-++            myref.data[cidx+1] = cmd->src;
-++            myref.linesize[cidx+1] = cmd->srcstride;
-++            myref1.data[cidx+1] = cmd->src1;
-++            myref1.linesize[cidx+1] = cmd->srcstride1;
-++            mymv.ref_idx[0] = cmd->ref_idx[0];
-++            mymv.ref_idx[1] = cmd->ref_idx[1];
-++            mymv.mv[0] = cmd->mv;
-++            mymv.mv[1] = cmd->mv1;
-++            chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1,
-++                         cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx);
-++            break;
-++        }
-++    }
-++    s->num_mv_cmds = 0;
-++}
-++
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -2598,6 +2653,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-++            // Perform inter prediction
-++            rpi_execute_inter_cmds(s);
-+             // Transform all blocks
-+             rpi_execute_transform(s);
-+             // Perform intra prediction and residual reconstruction
-+@@ -3350,6 +3407,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ }
-+ 
-+ #ifdef RPI
-++#ifdef RPI_PRECLEAR
-+ static av_cold void memclear16(int16_t *p, int n)
-+ {
-+   vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
-+@@ -3358,6 +3416,7 @@ static av_cold void memclear16(int16_t *p, int n)
-+   //  p[i] = 0;
-+ }
-+ #endif
-++#endif
-+ 
-+ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8ef6f51..8115d04 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -838,7 +838,7 @@ typedef struct HEVCLocalContext {
-+ #define RPI_CMD_LUMA_UNI 0
-+ #define RPI_CMD_CHROMA_UNI 1
-+ #define RPI_CMD_LUMA_BI 2
-+-#define RPI_CMD_U_BI 3
-++#define RPI_CMD_CHROMA_BI 3
-+ #define RPI_CMD_V_BI 4
-+ 
-+ // RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed?
-+-- 
-+2.5.0
-+
-+
-+From eba684df008749ec0f5751ea2343198006682a1c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 13:03:50 +0100
-+Subject: [PATCH 10/68] Added VPU thread
-+
-+---
-+ libavcodec/hevc.c    |  11 +++--
-+ libavcodec/hevc.h    |   1 +
-+ libavcodec/rpi_qpu.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.h |   2 +
-+ 4 files changed, 133 insertions(+), 6 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 83fdb57..9b3edf2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2516,8 +2516,10 @@ static void rpi_execute_transform(HEVCContext *s)
-+ 
-+ 
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-+-    vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-++    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-++    //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    //vpu_wait(s->vpu_id);
-+ 
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2653,10 +2655,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+-            // Perform inter prediction
-+-            rpi_execute_inter_cmds(s);
-+             // Transform all blocks
-+             rpi_execute_transform(s);
-++            // Perform inter prediction
-++            rpi_execute_inter_cmds(s);
-++            // Wait for transform completion
-++            vpu_wait(s->vpu_id);
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+@@ -3349,6 +3353,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->univ_pred_cmds);
-+ 
-+ #ifdef EARLY_MALLOC
-++    printf("hevc_decode_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+       gpu_free(&s->coeffs_buf_default);
-+       s->coeffs_buf_arm[0] = 0;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8115d04..d5d3302 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -922,6 +922,7 @@ typedef struct HEVCContext {
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+     int num_pred_cmds;
-++    int vpu_id;
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 12ad5fb..378dd74 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,9 +1,13 @@
-+ #ifdef RPI
-+-// Use the vcsm device for shared memory
-++// define RPI_USE_VCSM to use the vcsm device for shared memory
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+-#define RPI_TIME_TOTAL_QPU
-+-#define RPI_TIME_TOTAL_VPU
-++// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-++//#define RPI_TIME_TOTAL_QPU
-++// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-++//#define RPI_TIME_TOTAL_VPU
-++// define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-++#define RPI_ASYNC
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -113,6 +117,19 @@ static unsigned int Microseconds(void) {
-+ }
-+ #endif
-+ 
-++#ifdef RPI_ASYNC
-++pthread_t vpu_thread;
-++static void *vpu_start(void *arg);
-++
-++#define MAXCMDS 128
-++static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-++static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-++
-++static int vpu_cmds[MAXCMDS][8];
-++static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-++static volatile int vpu_async_head=0;
-++#endif
-++
-+ // Connect to QPU, returns 0 on success.
-+ static int gpu_init(volatile struct GPU **gpu) {
-+   int mb = mbox_open();
-+@@ -164,12 +181,27 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   // And the transform coefficients
-+   memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+ 
-++#ifdef RPI_ASYNC
-++  {
-++    int err;
-++    vpu_async_tail = 0;
-++    vpu_async_head = 0;
-++    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-++    //printf("Created thread\n");
-++    if (err) {
-++        printf("Failed to create vpu thread\n");
-++        return -4;
-++    }
-++  }
-++#endif
-++
-+   return 0;
-+ }
-+ 
-+ // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+ static void gpu_lock(void) {
-+   pthread_mutex_lock(&gpu_mutex);
-++
-+   if (gpu==NULL) {
-+     gpu_init(&gpu);
-+   }
-+@@ -264,6 +296,16 @@ static void gpu_term(void)
-+ 	unsigned handle = gpu->vc_handle;
-+   if (gpu==NULL)
-+     return;
-++
-++#ifdef RPI_ASYNC
-++  {
-++    void *res;
-++    vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-++    pthread_join(vpu_thread, &res);
-++  }
-++#endif
-++
-++
-+ 	unmapmem((void*)gpu, sizeof(struct GPU));
-+ 	mem_unlock(mb, handle);
-+ 	mem_free(mb, handle);
-+@@ -322,6 +364,79 @@ unsigned int vpu_get_constants(void) {
-+   return gpu->vc + offsetof(struct GPU,transMatrix2even);
-+ }
-+ 
-++#ifdef RPI_ASYNC
-++
-++static void *vpu_start(void *arg) {
-++  while(1) {
-++    pthread_mutex_lock(&post_mutex);
-++    while( vpu_async_tail - vpu_async_head <= 0)
-++    {
-++      //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-++      pthread_cond_wait(&post_cond, &post_mutex);
-++    }
-++    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    pthread_mutex_unlock(&post_mutex);
-++
-++    if (p[6] == -1) {
-++      break; // Last job
-++    }
-++    if (p[7]) {
-++        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-++        //gpu_cache_flush(buf);
-++    }
-++    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++
-++    pthread_mutex_lock(&post_mutex);
-++    vpu_async_head++;
-++    pthread_cond_broadcast(&post_cond);
-++    pthread_mutex_unlock(&post_mutex);
-++  }
-++
-++  return NULL;
-++}
-++
-++// Post a command to the queue
-++// Returns an id which we can use to wait for completion
-++int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
-++{
-++  pthread_mutex_lock(&post_mutex);
-++  {
-++    int id = vpu_async_tail++;
-++    int *p = vpu_cmds[id%MAXCMDS];
-++    int num = vpu_async_tail - vpu_async_head;
-++    if (num>MAXCMDS) {
-++      printf("Too many commands submitted\n");
-++      exit(-1);
-++    }
-++    p[0] = code;
-++    p[1] = r0;
-++    p[2] = r1;
-++    p[3] = r2;
-++    p[4] = r3;
-++    p[5] = r4;
-++    p[6] = r5;
-++    p[7] = (int) buf;
-++    if (num<=1)
-++      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-++    pthread_mutex_unlock(&post_mutex);
-++    return id;
-++  }
-++}
-++
-++// Wait for completion of the given command
-++void vpu_wait(int id)
-++{
-++  pthread_mutex_lock(&post_mutex);
-++  while( id + 1 - vpu_async_head > 0)
-++  {
-++    pthread_cond_wait(&post_cond, &post_mutex);
-++  }
-++  pthread_mutex_unlock(&post_mutex);
-++}
-++
-++#endif
-++
-++
-+ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
-+ {
-+   unsigned r;
-+@@ -334,7 +449,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-+   static int count=0;
-+   static long long countr2=0;
-+ #endif
-++#ifndef RPI_ASYNC
-+   gpu_lock();
-++#endif
-+ #ifdef RPI_TIME_TOTAL_VPU
-+   start_time = Microseconds();
-+   if (last_time==0)
-+@@ -351,7 +468,9 @@ unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2,
-+   if ((count&0x7f)==0)
-+     printf("VPU %d %lld On=%dms, Off=%dms\n",count,countr2,(int)(on_time/1000),(int)(off_time/1000));
-+ #endif
-++#ifndef RPI_ASYNC
-+   gpu_unlock();
-++#endif
-+   return r;
-+ }
-+ 
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 814fc3c..3526fce 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -36,6 +36,8 @@ extern unsigned int qpu_get_fn(int num);
-+ extern unsigned int vpu_get_fn(void);
-+ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-++extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-++extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+ extern int rpi_test_shader(void);
-+-- 
-+2.5.0
-+
-+
-+From d0d2aad80a05a30b2aca1d96dec3856c3a8d0ab9 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 6 May 2015 15:03:37 +0100
-+Subject: [PATCH 11/68] Added different signal when tail moves
-+
-+---
-+ libavcodec/rpi_qpu.c | 11 ++++++-----
-+ 1 file changed, 6 insertions(+), 5 deletions(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 378dd74..d1c3e20 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -122,7 +122,8 @@ pthread_t vpu_thread;
-+ static void *vpu_start(void *arg);
-+ 
-+ #define MAXCMDS 128
-+-static pthread_cond_t post_cond = PTHREAD_COND_INITIALIZER;
-++static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-++static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+ static int vpu_cmds[MAXCMDS][8];
-+@@ -372,7 +373,7 @@ static void *vpu_start(void *arg) {
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-+-      pthread_cond_wait(&post_cond, &post_mutex);
-++      pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+     int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -388,7 +389,7 @@ static void *vpu_start(void *arg) {
-+ 
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+-    pthread_cond_broadcast(&post_cond);
-++    pthread_cond_broadcast(&post_cond_head);
-+     pthread_mutex_unlock(&post_mutex);
-+   }
-+ 
-+@@ -417,7 +418,7 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+     p[6] = r5;
-+     p[7] = (int) buf;
-+     if (num<=1)
-+-      pthread_cond_broadcast(&post_cond); // Otherwise the vpu thread must already be awake
-++      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+     return id;
-+   }
-+@@ -429,7 +430,7 @@ void vpu_wait(int id)
-+   pthread_mutex_lock(&post_mutex);
-+   while( id + 1 - vpu_async_head > 0)
-+   {
-+-    pthread_cond_wait(&post_cond, &post_mutex);
-++    pthread_cond_wait(&post_cond_head, &post_mutex);
-+   }
-+   pthread_mutex_unlock(&post_mutex);
-+ }
-+-- 
-+2.5.0
-+
-+
-+From dcb7e7134ab80be7971979f9893a83814d7ea962 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 08:57:11 +0100
-+Subject: [PATCH 12/68] Add option to test for gpu_idle
-+
-+---
-+ libavcodec/hevc.c    |  3 ++-
-+ libavcodec/rpi_qpu.c | 18 ++++++++++++++++++
-+ 2 files changed, 20 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 9b3edf2..84cc636 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2514,7 +2514,6 @@ static void rpi_execute_transform(HEVCContext *s)
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-+     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+@@ -2656,6 +2655,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+             // Transform all blocks
-++            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index d1c3e20..85f49db 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -199,6 +199,17 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   return 0;
-+ }
-+ 
-++// Returns 1 if the gpu is currently idle
-++static int gpu_idle(void)
-++{
-++  int ret = pthread_mutex_trylock(&gpu_mutex);
-++  if (ret==0) {
-++    pthread_mutex_unlock(&gpu_mutex);
-++    return 1;
-++  }
-++  return 0;
-++}
-++
-+ // Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+ static void gpu_lock(void) {
-+   pthread_mutex_lock(&gpu_mutex);
-+@@ -400,6 +411,13 @@ static void *vpu_start(void *arg) {
-+ // Returns an id which we can use to wait for completion
-+ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
-+ {
-++  // If the gpu is idle then just run the command immediately
-++  // This works, but doesn't seem to give any benefit
-++  // if (gpu_idle()) {
-++  //   vpu_execute_code( code,  r0,  r1,  r2,  r3,  r4,  r5);
-++  //   return -1; // TODO perhaps a wraparound bug here?
-++  // }
-++
-+   pthread_mutex_lock(&post_mutex);
-+   {
-+     int id = vpu_async_tail++;
-+-- 
-+2.5.0
-+
-+
-+From 44d05d44ab3f81fec1ba75082ca2fe9340cb229c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 11:01:35 +0100
-+Subject: [PATCH 13/68] Added deblocking pass
-+
-+---
-+ libavcodec/hevc.c        | 33 +++++++++++++++++++++++++++------
-+ libavcodec/hevc.h        |  7 ++++++-
-+ libavcodec/hevc_filter.c |  6 +++++-
-+ libavcodec/rpi_qpu.c     |  2 +-
-+ 4 files changed, 39 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 84cc636..57b0b63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2505,6 +2505,17 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ }
-+ 
-+ #ifdef RPI
-++static void rpi_execute_dblk_cmds(HEVCContext *s)
-++{
-++    int n;
-++    int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-++    int (*p)[2] = s->dblk_cmds;
-++    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-++        ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-++    }
-++    s->num_dblk_cmds = 0;
-++}
-++
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+@@ -2618,7 +2629,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-+-    int start_ctb_x = (s->sh.slice_ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
-+     s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-+ #endif
-+ 
-+@@ -2652,7 +2662,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ #ifdef RPI
-+-        if (s->enable_rpi && x_ctb + ctb_size >= s->ps.sps->width) {
-++        if (s->enable_rpi) {
-++          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-++          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-++          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+             int x;
-+             // Transform all blocks
-+             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+@@ -2665,10 +2678,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+-            for(x = start_ctb_x; x <= x_ctb; x += ctb_size) {  // TODO this will fail for tiles
-+-                ff_hevc_hls_filters(s, x, y_ctb, ctb_size);
-+-            }
-+-            start_ctb_x = 0;
-++            rpi_execute_dblk_cmds(s);
-++          }
-+         }
-+ #endif
-+         if (more_data < 0) {
-+@@ -2686,6 +2697,16 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
-+     }
-+ 
-++#ifdef RPI
-++    if (s->enable_rpi && s->num_dblk_cmds) {
-++        rpi_execute_transform(s);
-++        rpi_execute_inter_cmds(s);
-++        vpu_wait(s->vpu_id);
-++        rpi_execute_pred_cmds(s);
-++        rpi_execute_dblk_cmds(s);
-++    }
-++#endif
-++
-+     if (x_ctb + ctb_size >= s->ps.sps->width &&
-+         y_ctb + ctb_size >= s->ps.sps->height)
-+         ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index d5d3302..0b4c175 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -834,6 +834,8 @@ typedef struct HEVCLocalContext {
-+ #define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-++// Worst case is 16x16 CTUs
-++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
-+ 
-+ #define RPI_CMD_LUMA_UNI 0
-+ #define RPI_CMD_CHROMA_UNI 1
-+@@ -893,6 +895,9 @@ typedef struct HEVCPredCmd {
-+ #endif
-+ 
-+ typedef struct HEVCContext {
-++#ifdef RPI
-++    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-++#endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+ 
-+@@ -917,11 +922,11 @@ typedef struct HEVCContext {
-+     GPU_MEM_PTR_T coeffs_buf_accelerated;
-+     int16_t *coeffs_buf_arm[4];
-+     unsigned int coeffs_buf_vc[4];
-+-
-+     int num_coeffs[4];
-+     int num_xfm_cmds;
-+     int num_mv_cmds;
-+     int num_pred_cmds;
-++    int num_dblk_cmds;
-+     int vpu_id;
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index e4c3da7..ea0af91 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -877,8 +877,12 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             if (s->threads_type & FF_THREAD_FRAME )
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+         }
-+-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
-++    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-++        int newh = y + ctb_size - 4;
-++        //int currh = s->ref->tf.progress->data[0];
-++        //if (((y + ctb_size)&63)==0)
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++    }
-+ }
-+ 
-+ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 85f49db..3b6dae7 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -105,7 +105,7 @@ struct GPU
-+ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-+ 
-+-#ifdef RPI_TIME_TOTAL_QPU
-++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-+ static unsigned int Microseconds(void) {
-+     struct timespec ts;
-+     unsigned int x;
-+-- 
-+2.5.0
-+
-+
-+From c4e1242d732ea2a14ce7cee5fb36e79bd2d8db35 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 7 May 2015 16:47:47 +0100
-+Subject: [PATCH 14/68] Added option to disable deblocking for non-ref frames
-+
-+---
-+ libavcodec/hevc_filter.c | 10 ++++++++++
-+ 1 file changed, 10 insertions(+)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index ea0af91..2cdd621 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -25,6 +25,8 @@
-+ //#define DISABLE_SAO
-+ //#define DISABLE_DEBLOCK
-+ //#define DISABLE_STRENGTHS
-++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-++//#define DISABLE_DEBLOCK_NONREF
-+ 
-+ #include "libavutil/common.h"
-+ #include "libavutil/internal.h"
-+@@ -504,6 +506,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                 s->ps.sps->pcm.loop_filter_disable_flag) ||
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-++#ifdef DISABLE_DEBLOCK_NONREF
-++    if (    s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )
-++      return; // Don't deblock non-reference frames
-++#endif
-+ #ifdef DISABLE_DEBLOCK
-+     return;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 8b2f1cd9b31d0c1ded0b00d4106b18897c1450e5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 11 May 2015 10:00:27 +0100
-+Subject: [PATCH 15/68] Moved buffers to VPU memory
-+
-+---
-+ libavcodec/hevc_filter.c | 17 +++++++++++++-
-+ libavcodec/utils.c       | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
-+ libavutil/buffer.c       |  6 +++++
-+ libavutil/buffer.h       |  3 +++
-+ 4 files changed, 84 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 2cdd621..e1b32d4 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -866,6 +866,13 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+ #undef CB
-+ #undef CR
-+ 
-++#ifdef RPI_INTER_QPU
-++static void flush_buffer(AVBufferRef *bref) {
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++    gpu_cache_flush(p);
-++}
-++#endif
-++
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-+@@ -888,9 +895,17 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+         }
-+     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-+-        int newh = y + ctb_size - 4;
-++        //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-++        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )) {
-++            flush_buffer(s->frame->buf[1]);
-++            flush_buffer(s->frame->buf[2]);
-++        }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+ }
-+diff --git a/libavcodec/utils.c b/libavcodec/utils.c
-+index 892ddb9..df750a8 100644
-+--- a/libavcodec/utils.c
-++++ b/libavcodec/utils.c
-+@@ -26,6 +26,12 @@
-+  */
-+ 
-+ #include "config.h"
-++
-++#ifdef RPI
-++// Move video buffers to GPU memory
-++#define RPI_GPU_BUFFERS
-++#endif
-++
-+ #include "libavutil/atomic.h"
-+ #include "libavutil/attributes.h"
-+ #include "libavutil/avassert.h"
-+@@ -70,6 +76,10 @@
-+ #include "libavutil/ffversion.h"
-+ const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
-+ 
-++#ifdef RPI_GPU_BUFFERS
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
-+ static int default_lockmgr_cb(void **arg, enum AVLockOp op)
-+ {
-+@@ -505,6 +515,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
-+     return ret;
-+ }
-+ 
-++#ifdef RPI_GPU_BUFFERS
-++static void rpi_buffer_default_free(void *opaque, uint8_t *data)
-++{
-++    GPU_MEM_PTR_T *p = opaque;
-++    gpu_free(p);
-++    av_free(p);
-++}
-++
-++static AVBufferRef *rpi_buffer_alloc(int size)
-++{
-++    AVBufferRef *ret = NULL;
-++    uint8_t    *data = NULL;
-++    GPU_MEM_PTR_T *p;
-++
-++    static int total=0;
-++    total+=size;
-++
-++    p = av_malloc(sizeof *p);
-++    if (!p)
-++        return NULL;
-++
-++    if (gpu_malloc_cached(size,p)<0)  // Change this line to choose cached or uncached memory.  The caching here refers to the ARM data cache.
-++        return NULL;
-++
-++    data = p->arm;
-++    printf("Rpi alloc %d/%d ARM=%p VC=%x->%x\n",size,total,p->arm,p->vc,p->vc+size);
-++    //memset(data, 64, size);
-++
-++    if (!data)
-++        return NULL;
-++
-++    ret = av_buffer_create(data, size, rpi_buffer_default_free, p, 0);
-++    if (!ret) {
-++        gpu_free(p);
-++        av_freep(&p);
-++    }
-++
-++    return ret;
-++}
-++#endif
-++
-+ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
-+ {
-+     FramePool *pool = avctx->internal->pool;
-+@@ -549,6 +600,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
-+             av_buffer_pool_uninit(&pool->pools[i]);
-+             pool->linesize[i] = picture.linesize[i];
-+             if (size[i]) {
-++#ifdef RPI_GPU_BUFFERS
-++                if (avctx->codec_id == AV_CODEC_ID_HEVC)
-++                    pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-++                                                     CONFIG_MEMORY_POISONING ?
-++                                                        NULL :
-++                                                        rpi_buffer_alloc);
-++                else
-++#endif
-+                 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
-+                                                      CONFIG_MEMORY_POISONING ?
-+                                                         NULL :
-+diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-+index bb112c2..7f8bfab 100644
-+--- a/libavutil/buffer.c
-++++ b/libavutil/buffer.c
-+@@ -400,3 +400,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
-+ 
-+     return ret;
-+ }
-++
-++// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T)
-++void *av_buffer_pool_opaque(AVBufferRef *ref) {
-++  BufferPoolEntry *buf = av_buffer_get_opaque(ref);
-++  return buf->opaque;
-++}
-+diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-+index b4399fd..0489002 100644
-+--- a/libavutil/buffer.h
-++++ b/libavutil/buffer.h
-+@@ -267,6 +267,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
-+  */
-+ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
-+ 
-++// Return the opaque for the underlying frame
-++void *av_buffer_pool_opaque(AVBufferRef *ref);
-++
-+ /**
-+  * @}
-+  */
-+-- 
-+2.5.0
-+
-+
-+From a51c8db9d5ed7d90ad83d7791dd8924911a88bd7 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 11 May 2015 14:04:37 +0100
-+Subject: [PATCH 16/68] Prepared QPU execute code
-+
-+---
-+ libavcodec/hevc.c        | 227 ++++++++++++++++++++++++++++++++++++++++-------
-+ libavcodec/hevc.h        |  22 ++++-
-+ libavcodec/hevc_filter.c |   7 +-
-+ libavcodec/rpi_qpu.c     |  55 +++++++++++-
-+ libavcodec/rpi_qpu.h     |   2 +
-+ 5 files changed, 276 insertions(+), 37 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 57b0b63..d055b47 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -40,17 +40,45 @@
-+ #include "hevc.h"
-+ 
-+ #ifdef RPI
-+-#include "rpi_qpu.h"
-+-// For some unknown reason, the code seems to crash if I do a late malloc
-+-#define EARLY_MALLOC
-+-// Move Inter prediction into separate pass
-+-#define RPI_INTER
-++  #include "rpi_qpu.h"
-++  // For some unknown reason, the code seems to crash if I do a late malloc
-++  #define EARLY_MALLOC
-++  // Move Inter prediction into separate pass
-++  #define RPI_INTER
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+ 
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-++
-++#ifdef RPI_INTER_QPU
-++
-++#define RPI_CHROMA_COMMAND_WORDS 12
-++// The QPU code for UV blocks only works up to a block width of 8
-++#define RPI_CHROMA_BLOCK_WIDTH 8
-++
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-++
-++// TODO Chroma only needs 4 taps
-++static uint32_t rpi_filter_coefs[8][2] = {
-++        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-++};
-++
-++static uint32_t get_vc_address(AVBufferRef *bref) {
-++  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++  return p->vc;
-++}
-++
-++#endif
-++
-+ /**
-+  * NOTE: Each function hls_foo correspond to the function foo in the
-+  * specification (HLS stands for High Level Syntax).
-+@@ -64,6 +92,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-+ #ifdef RPI
-++
-+ #ifdef EARLY_MALLOC
-+ #else
-+     printf("pic_arrays_free\n");
-+@@ -1969,6 +1998,43 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int reflist = 0;
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[reflist];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
-+                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
-+@@ -2619,6 +2685,54 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ 
-+ #endif
-+ 
-++#ifdef RPI_INTER_QPU
-++static void rpi_inter_clear(HEVCContext *s)
-++{
-++    int i;
-++    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-++    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-++    for(i=0;i<8;i++) {
-++        s->u_mvs[i] = s->mvs_base[i];
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = pic_width;
-++        *s->u_mvs[i]++ = pic_height;
-++        *s->u_mvs[i]++ = s->frame->linesize[1];
-++        *s->u_mvs[i]++ = s->frame->linesize[2];
-++        s->u_mvs[i] += 3;  // Padding words
-++    }
-++}
-++
-++static void rpi_execute_inter_qpu(HEVCContext *s)
-++{
-++    int k;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++
-++    if (s->sh.slice_type == I_SLICE)
-++        return;
-++    for(k=0;k<8;k++) {
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++    }
-++
-++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++
-++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      );
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -2645,6 +2759,10 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         }
-+     }
-+ 
-++#ifdef RPI_INTER_QPU
-++    rpi_inter_clear(s);
-++#endif
-++
-+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-+         int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+ 
-+@@ -2666,19 +2784,30 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-            int x;
-++#ifdef RPI_INTER_QPU
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++#endif
-+             // Transform all blocks
-+             //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-++
-++            // Copy back reconstructed data
-++            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
-++            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
-++            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-++
-+             // Perform intra prediction and residual reconstruction
-+             rpi_execute_pred_cmds(s);
-+             // Perform deblocking for CTBs in this row
-+             rpi_execute_dblk_cmds(s);
-++#ifdef RPI_INTER_QPU
-++            rpi_inter_clear(s);
-++#endif
-+           }
-+         }
-+ #endif
-+@@ -2699,6 +2828,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ 
-+ #ifdef RPI
-+     if (s->enable_rpi && s->num_dblk_cmds) {
-++#ifdef RPI_INTER_QPU
-++        rpi_execute_inter_qpu(s);
-++#endif
-+         rpi_execute_transform(s);
-+         rpi_execute_inter_cmds(s);
-+         vpu_wait(s->vpu_id);
-+@@ -3374,6 +3506,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+ 
-++#ifdef RPI_INTER_QPU
-++    if (s->unif_mvs) {
-++        gpu_free( &s->unif_mvs_ptr );
-++        s->unif_mvs = 0;
-++    }
-++#endif
-++    //gpu_free(&s->dummy);
-++
-+ #ifdef EARLY_MALLOC
-+     printf("hevc_decode_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+@@ -3469,34 +3609,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+ 
-+-    s->coeffs_buf_arm[0] = 0;
-+-    s->coeffs_buf_arm[2] = 0;
-++#ifdef RPI_INTER_QPU
-++    // We divide the image into blocks 256 wide and 64 high
-++    // We support up to 2048 widths
-++    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
-++    // Also add space for the startup command for each stream.
-++
-++    {
-++        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++        uint32_t *p;
-++        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++
-++        // Set up initial locations for uniform streams
-++        p = s->unif_mvs;
-++        for(i = 0; i < 8; i++) {
-++            s->mvs_base[i] = p;
-++            p += uv_commands_per_qpu;
-++        }
-++        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-++        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-++
-++    }
-++#endif
-++    //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+ #ifdef EARLY_MALLOC
-+-    int coeffs_in_ctb = 64*64;
-+-    int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    printf("Allocated %d\n",coefs_per_row);
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-    if (!s->coeffs_buf_arm[0])
-+-        goto fail;
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-    if (!s->coeffs_buf_arm[2])
-+-        goto fail;
-+-    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-    printf("Done\n");
-++    {
-++        int coeffs_in_ctb = 64*64;
-++        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-++        s->coeffs_buf_arm[0] = 0;
-++        s->coeffs_buf_arm[2] = 0;
-++        printf("Allocated %d\n",coefs_per_row);
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-++        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-++        if (!s->coeffs_buf_arm[0])
-++            goto fail;
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-++        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-++        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-++        if (!s->coeffs_buf_arm[2])
-++            goto fail;
-++        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-++        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-++        printf("Done\n");
-+ #ifdef RPI_PRECLEAR
-+-    //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+-    //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+-    //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+-    memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-++        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-++        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+ #endif
-+-
-++    }
-+ #endif
-+ 
-+     s->enable_rpi = 0;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 0b4c175..8923a25 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -41,7 +41,11 @@
-+ 
-+ // define RPI to split the CABAC/prediction/transform into separate stages
-+ #ifdef RPI
-+-#include "rpi_qpu.h"
-++
-++  #include "rpi_qpu.h"
-++  // Use QPU for inter prediction
-++  //#define RPI_INTER_QPU
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -914,7 +918,7 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;
-++    HEVCMvCmd *unif_mv_cmds;  // TODO rename
-+     HEVCXfmCmd *unif_xfm_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+     int buf_width;
-+@@ -928,6 +932,20 @@ typedef struct HEVCContext {
-+     int num_pred_cmds;
-+     int num_dblk_cmds;
-+     int vpu_id;
-++    //GPU_MEM_PTR_T dummy;
-++#ifdef RPI_INTER_QPU
-++    GPU_MEM_PTR_T unif_mvs_ptr;
-++    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++
-++    // _base pointers are to the start of the row
-++    uint32_t *mvs_base[8];
-++    // these pointers are to the next free space
-++    uint32_t *u_mvs[8];
-++    // Function pointers
-++    uint32_t mc_filter_uv;
-++    uint32_t mc_filter_uv_b;
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index e1b32d4..5b3d759 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -903,8 +903,11 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-            flush_buffer(s->frame->buf[1]);
-+-            flush_buffer(s->frame->buf[2]);
-++            //flush_buffer(s->frame->buf[1]);
-++            //flush_buffer(s->frame->buf[2]);
-++            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-++            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-++            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+         }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 3b6dae7..e4dd58a 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -3,7 +3,7 @@
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ #define RPI_USE_VCSM
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+-//#define RPI_TIME_TOTAL_QPU
-++#define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+@@ -30,7 +30,7 @@
-+ #endif
-+ 
-+ // On Pi2 there is no way to access the VPU L2 cache
-+-// GPU_MEM_FLG should be 4 for uncached memory.
-++// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+ // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-+ // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-+ #define GPU_MEM_FLG 0xC
-+@@ -549,6 +549,54 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-+   gpu_unlock();
-+ }
-+ 
-++// Run a program on 8 QPUs with the given code and uniform stream (given in GPU addresses)
-++void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++{
-++  int i;
-++#ifdef RPI_TIME_TOTAL_QPU
-++  static int last_time=0;
-++  static long long on_time=0;
-++  static long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  static int count=0;
-++#endif
-++
-++  gpu_lock();
-++#ifdef RPI_TIME_TOTAL_QPU
-++  start_time = Microseconds();
-++  if (last_time==0)
-++    last_time = start_time;
-++  off_time += start_time-last_time;
-++#endif
-++  for(i=0;i<8;i++) {
-++    gpu->mail[i*2 + 1] = code;
-++  }
-++  gpu->mail[0 ] = unifs1;
-++  gpu->mail[2 ] = unifs2;
-++  gpu->mail[4 ] = unifs3;
-++  gpu->mail[6 ] = unifs4;
-++  gpu->mail[8 ] = unifs5;
-++  gpu->mail[10] = unifs6;
-++	gpu->mail[12] = unifs7;
-++	gpu->mail[14] = unifs8;
-++	execute_qpu(
-++		gpu->mb,
-++		8 /* Number of QPUs */,
-++		gpu->vc + offsetof(struct GPU, mail),
-++		1 /* no flush */,  // Don't flush VPU L1 cache
-++		5000 /* timeout ms */);
-++#ifdef RPI_TIME_TOTAL_QPU
-++  end_time = Microseconds();
-++  last_time = end_time;
-++  on_time += end_time - start_time;
-++  count++;
-++  if ((count&0x7f)==0)
-++    printf("On=%dms, Off=%dms\n",(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-++  gpu_unlock();
-++}
-++
-+ unsigned int qpu_get_fn(int num) {
-+     // Make sure that the gpu is initialized
-+     unsigned int *fn;
-+@@ -585,6 +633,9 @@ unsigned int qpu_get_fn(int num) {
-+     case QPU_MC_FILTER_UV_B:
-+       fn = mc_filter_uv_b;
-+       break;
-++    case QPU_MC_INTERRUPT_EXIT8:
-++      fn = mc_interrupt_exit8;
-++      break;
-+     case QPU_MC_END:
-+       fn = mc_end;
-+       break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 3526fce..2b22d98 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -16,6 +16,7 @@ extern void gpu_free(GPU_MEM_PTR_T *p);
-+ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+ 
-+ // QPU specific functions
-++extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-+ 
-+ enum {
-+@@ -28,6 +29,7 @@ enum {
-+   QPU_MC_SETUP_UV,
-+   QPU_MC_FILTER_UV,
-+   QPU_MC_FILTER_UV_B,
-++  QPU_MC_INTERRUPT_EXIT8,
-+   QPU_MC_END
-+   };
-+ extern unsigned int qpu_get_fn(int num);
-+-- 
-+2.5.0
-+
-+
-+From 5fc9797992781c83747eadba05b8092cd85ebba7 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 11:47:23 +0100
-+Subject: [PATCH 17/68] Drafted chroma interpolation on QPUs
-+
-+---
-+ libavcodec/hevc.c          |   5 ++-
-+ libavcodec/hevc.h          |   2 +-
-+ libavcodec/hevc_filter.c   |   6 ++-
-+ libavcodec/rpi_qpu.c       | 101 +++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.h       |   1 +
-+ libavcodec/rpi_shader.c    |  42 +++++++++----------
-+ libavcodec/rpi_shader.qasm |  42 +++++++++----------
-+ 7 files changed, 149 insertions(+), 50 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index d055b47..7897fdd 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -58,11 +58,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-+-#define ENCODE_COEFFS(c0, c1, c2, c3) (((-c0) & 0xff) | ((-c1) & 0xff) << 8 | ((-c2) & 0xff) << 16 | ((-c3) & 0xff) << 24)
-++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+ static uint32_t rpi_filter_coefs[8][2] = {
-+-        { ENCODE_COEFFS(  0,  0,  0, 128), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-++        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-+         { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-+@@ -2716,6 +2716,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8923a25..a0d4631 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -44,7 +44,7 @@
-+ 
-+   #include "rpi_qpu.h"
-+   // Use QPU for inter prediction
-+-  //#define RPI_INTER_QPU
-++  // #define RPI_INTER_QPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 5b3d759..9b6e26d 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -903,8 +903,10 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-            //flush_buffer(s->frame->buf[1]);
-+-            //flush_buffer(s->frame->buf[2]);
-++#ifdef RPI_INTER_QPU
-++            flush_buffer(s->frame->buf[1]);
-++            flush_buffer(s->frame->buf[2]);
-++#endif
-+             //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+             //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+             //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index e4dd58a..4d9eda8 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -33,7 +33,8 @@
-+ // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+ // However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly
-+ // The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
-+-#define GPU_MEM_FLG 0xC
-++#define GPU_MEM_FLG 0x4
-++// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0  (On Pi1 it allows ARM to access VPU L2 cache)
-+ #define GPU_MEM_MAP 0x0
-+ 
-+ #define vcos_verify(x) ((x)>=0)
-+@@ -165,6 +166,8 @@ static int gpu_init(volatile struct GPU **gpu) {
-+ 	ptr->vc_handle = handle;
-+ 	ptr->vc = vc;
-+ 
-++  printf("GPU allocated at 0x%x\n",vc);
-++
-+   *gpu = ptr;
-+ 
-+   // Now copy over the QPU code into GPU memory
-+@@ -304,10 +307,13 @@ int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-+ 
-+ static void gpu_term(void)
-+ {
-+-	int mb = gpu->mb;
-+-	unsigned handle = gpu->vc_handle;
-++	int mb;
-++	unsigned handle;
-++
-+   if (gpu==NULL)
-+     return;
-++  mb = gpu->mb;
-++  handle = gpu->vc_handle;
-+ 
-+ #ifdef RPI_ASYNC
-+   {
-+@@ -648,6 +654,95 @@ unsigned int qpu_get_fn(int num) {
-+ }
-+ 
-+ #if 0
-++typedef unsigned int uint32_t;
-++
-++typedef struct mvs_s {
-++    GPU_MEM_PTR_T unif_mvs_ptr;
-++    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++
-++    // _base pointers are to the start of the row
-++    uint32_t *mvs_base[8];
-++    // these pointers are to the next free space
-++    uint32_t *u_mvs[8];
-++
-++} HEVCContext;
-++
-++#define RPI_CHROMA_COMMAND_WORDS 12
-++
-++static void rpi_inter_clear(HEVCContext *s)
-++{
-++    int i;
-++    for(i=0;i<8;i++) {
-++        s->u_mvs[i] = s->mvs_base[i];
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 0;
-++        *s->u_mvs[i]++ = 128;  // w
-++        *s->u_mvs[i]++ = 128;  // h
-++        *s->u_mvs[i]++ = 128;  // stride u
-++        *s->u_mvs[i]++ = 128;  // stride v
-++        s->u_mvs[i] += 3;  // Padding words
-++    }
-++}
-++
-++static void rpi_execute_inter_qpu(HEVCContext *s)
-++{
-++    int k;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++
-++    for(k=0;k<8;k++) {
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
-++    }
-++
-++    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++
-++    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-++      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      );
-++}
-++
-++void rpi_test_qpu(void)
-++{
-++    HEVCContext mvs;
-++    HEVCContext *s = &mvs;
-++    int i;
-++    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++    uint32_t *p;
-++    printf("Allocate memory\n");
-++    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-++
-++    // Set up initial locations for uniform streams
-++    p = s->unif_mvs;
-++    for(i = 0; i < 8; i++) {
-++        s->mvs_base[i] = p;
-++        p += uv_commands_per_qpu;
-++    }
-++    // Now run a simple program that should just quit immediately after a single texture fetch
-++    rpi_inter_clear(s);
-++    for(i=0;i<4;i++) {
-++      printf("Launch QPUs\n");
-++      rpi_execute_inter_qpu(s);
-++      printf("Done\n");
-++    }
-++    printf("Free memory\n");
-++    gpu_free(&s->unif_mvs_ptr);
-++    return;
-++}
-++#endif
-++
-++#if 0
-+ 
-+ int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+ //int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 2b22d98..f9ad333 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -18,6 +18,7 @@ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-+ // QPU specific functions
-+ extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8, int unifs9, int unifs10, int unifs11, int unifs12);
-++extern void rpi_test_qpu(void);
-+ 
-+ enum {
-+   QPU_MC_SETUP,
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 41cc2e1..d7ed297 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -152,23 +152,23 @@ unsigned int rpi_shader[] = {
-+ /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ /* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ /* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000450] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ /* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000460] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ /* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000470] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ /* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000480] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+ /* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+ /* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+ /* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+@@ -179,20 +179,20 @@ unsigned int rpi_shader[] = {
-+ /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+ /* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
-+ /* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000004e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x000004f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x000004f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000500] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000508] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000510] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000518] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000520] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
-+ /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ /* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+ /* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+ /* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+ /* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 6851e83..02fdcb2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -270,23 +270,23 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++mov r2, rb21         ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-++add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+@@ -302,23 +302,23 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+ max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-++asr r0, r0, 14          ; mov r1, ra21
-+ min.setf ra15, r0, rb22
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add.ifnn r1, r1, r0     ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ brr.anyn -, r:uvloop
-+-asr r1, r1, 15
-++asr r1, r1, 14
-+ min r1, r1, rb22
-+ max vpm, r1, 0
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 41380ff18142eef6a80ffae43f0c3d810c9384d8 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 13:54:11 +0100
-+Subject: [PATCH 18/68] Fixed chroma inter prediction
-+
-+---
-+ libavcodec/hevc.c          |    8 +-
-+ libavcodec/hevc.h          |    2 +-
-+ libavcodec/rpi_shader.c    | 1170 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   22 +-
-+ libavcodec/rpi_shader.qasm |   24 +-
-+ 5 files changed, 617 insertions(+), 609 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7897fdd..bcc831e 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -55,9 +55,11 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ #ifdef RPI_INTER_QPU
-+ 
-+ #define RPI_CHROMA_COMMAND_WORDS 12
-++#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-++
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+@@ -2011,7 +2013,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                int chan = x0>>8;
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2717,6 +2720,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-++        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+@@ -3617,7 +3621,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     // Also add space for the startup command for each stream.
-+ 
-+     {
-+-        int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-++        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-+         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index a0d4631..cae6659 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -44,7 +44,7 @@
-+ 
-+   #include "rpi_qpu.h"
-+   // Use QPU for inter prediction
-+-  // #define RPI_INTER_QPU
-++  #define RPI_INTER_QPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index d7ed297..831633b 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -33,7 +33,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+ /* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+ /* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000058] */ 0x00000040, 0xe0020567, // mov ra21, 64
-++/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+ /* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+ /* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+@@ -152,7 +152,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x55015fc6, 0x100248a2, // mov r2, rb21         ; mul24 r2, r0, ra0
-++/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+ /* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ /* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+@@ -178,9 +178,9 @@ unsigned int rpi_shader[] = {
-+ /* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x8f54e1f6, 0xd0024821, // asr r0, r0, 14          ; mov r1, ra21
-+-/* [0x000004e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+ /* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+ /* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+ /* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+@@ -189,400 +189,400 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+ /* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ /* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8c9f223f, 0x100a0867, // add.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+ /* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000538] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter
-+-/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005b0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x000005b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005c0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x000005c8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005d0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x000005d8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005e0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x000005e8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x000005f0] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000600] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000610] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000618] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000620] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+-/* [0x00000710] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000718] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000720] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000728] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-++/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :loop
-+-/* [0x00000730] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000738] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000740] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000748] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000750] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000758] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000760] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000768] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000770] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000778] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000780] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000788] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000790] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007a0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007b0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007c0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007d0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007e0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007f0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000800] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000848] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x00000850] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000858] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000860] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000868] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000870] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000878] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000880] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000888] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000890] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000898] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008a0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x000008c0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-++/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // :fast_path
-+-/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :fast_loop
-+-/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000910] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-/* [0x00000918] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-/* [0x00000920] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000928] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+-/* [0x00000930] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000938] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-/* [0x00000940] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000948] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000950] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-/* [0x00000958] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-/* [0x00000960] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-/* [0x00000968] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-/* [0x00000970] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-/* [0x00000978] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-/* [0x00000980] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-/* [0x00000988] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000990] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000998] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000009a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000009a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000009b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000009b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+-/* [0x000009c8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x000009d0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000009d8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x000009e0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000009e8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x000009f0] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x000009f8] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a00] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a08] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a10] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a18] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000a20] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000a28] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000a30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000a38] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x00000a40] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x00000a48] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a50] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-++/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-++/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-++/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-++/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-++/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-++/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-++/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-++/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-++/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-++/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-++/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-++/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-++/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00000a78] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000a80] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000a88] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000a90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000a98] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000aa0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000aa8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000ab0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000ab8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000ac0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000ac8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000ad0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000ad8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000ae0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000ae8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000af0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000b00] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000b08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b10] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000b18] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000b20] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000b28] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000b30] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000b38] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000b40] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000b48] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000b50] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000b58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000b60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000b68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000b70] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00000b78] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ba0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000bc0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bc8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bd0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bd8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000be0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000be8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000c00] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000c08] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c10] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :bloop
-+-/* [0x00000c18] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000c20] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000c28] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000c30] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000c38] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000c40] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000c48] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000c50] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000c58] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000c60] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c70] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000c78] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000c80] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000c88] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000c90] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000c98] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000ca0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000ca8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000cb0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000cb8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000cc0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000cc8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000cd0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000cd8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000ce0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000ce8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000cf0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000cf8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000d00] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000d08] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000d10] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000d18] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000d20] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000d28] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000d30] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000d38] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000d40] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000d48] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000d50] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000d58] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000d60] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000d68] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000d70] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000d78] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000d80] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000d88] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000d90] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000d98] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000da0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+-/* [0x00000da8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000db0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000db8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000dc0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000dc8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000dd0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000dd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000de0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000de8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000df0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-++/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-++/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_honly
-+-/* [0x00000df8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000e00] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000e08] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000e10] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000e18] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000e20] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000e28] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000e30] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000e38] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000e40] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000e48] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000e50] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000e58] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000e60] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000e68] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000e70] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000e78] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e80] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e90] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e98] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000ea0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000ea8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+-/* [0x00000eb0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+-/* [0x00000eb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000ec0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000ec8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000ed0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ed8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f08] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f20] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f30] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-++/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-++/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-++/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-++/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-++/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-++/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-++/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-++/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-++/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :loop_honly
-+-/* [0x00000f38] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000f40] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000f48] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000f50] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f58] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000f60] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f68] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f70] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000f78] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000f80] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000f88] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f90] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000f98] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fa0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fa8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fb0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000fb8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000fc0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000fc8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000fd0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000fd8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000fe0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000fe8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000ff0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ff8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001000] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001008] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001010] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+-/* [0x00001018] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+-/* [0x00001020] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+-/* [0x00001028] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+-/* [0x00001030] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+-/* [0x00001038] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+-/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001048] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001050] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001058] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-++/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-++/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-++/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-++/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-++/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-++/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-++/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-++/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00001060] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001068] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00001070] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001078] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+ /* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001090] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001098] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x000010a0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_exit1
-+-/* [0x000010a8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit
-+-/* [0x000010e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001110] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001118] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+@@ -592,225 +592,227 @@ unsigned int rpi_shader[] = {
-+ /* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001168] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001170] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001178] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit4
-+-/* [0x00001180] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001188] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001190] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000011c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000011d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x000011d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000011e0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001200] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001208] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001238] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001240] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001248] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup_uv
-+-/* [0x00001250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001258] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+-/* [0x00001260] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00001268] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00001270] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00001278] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+-/* [0x00001280] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00001288] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00001290] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00001298] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000012a0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x000012a8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x000012b0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x000012b8] */ 0x00000040, 0xe0020567, // mov ra21, 64
-+-/* [0x000012c0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x000012c8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x000012d0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x000012d8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x000012e0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x000012e8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000012f0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000012f8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00001300] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00001308] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00001310] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00001318] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00001320] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00001328] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00001330] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001338] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001340] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00001348] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00001350] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00001358] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00001360] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00001368] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001370] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00001378] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00001380] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00001388] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001390] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001398] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000013a0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000013a8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000013b0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x000013b8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x000013c0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000013c8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x000013d0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x000013d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000013e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000013e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000013f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000013f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00001400] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00001408] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001410] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x00001418] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001420] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00001428] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00001430] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00001438] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001440] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-++/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
-++/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001450] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00001458] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001460] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00001468] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001470] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00001478] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00001480] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv_b
-+-/* [0x00001488] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001490] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001498] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000014a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000014a8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000014b0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000014b8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000014c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000014c8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000014d0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000014d8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000014e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000014e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000014f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000014f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001500] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001508] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001510] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001518] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001520] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001528] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00001530] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00001538] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00001540] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001548] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001550] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001558] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00001560] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00001568] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001570] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001578] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001580] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001588] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001590] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001598] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000015b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000015d0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015d8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000015f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000015f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00001608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00001610] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00001618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00001620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00001630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00001648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00001650] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00001658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001660] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00001668] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00001670] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00001678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00001680] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00001688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001690] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000016a0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000016a8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000016b0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000016b8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000016c0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000016c8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000016d0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000016d8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000016e0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x000016e8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000016f0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000016f8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001700] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001708] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001710] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001718] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001720] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00001728] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00001730] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00001738] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00001740] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00001748] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00001750] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00001758] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00001760] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00001768] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00001770] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00001778] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00001780] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00001788] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00001790] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00001798] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000017a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000017a8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000017b0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000017b8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000017c0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000017c8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000017d0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000017d8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000017e0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000017e8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000017f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000017f8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001800] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00001808] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001810] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index db971f4..3464cdb 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,16 +5,16 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 146)
-+-#define mc_filter (rpi_shader + 360)
-+-#define mc_filter_b (rpi_shader + 670)
-+-#define mc_filter_honly (rpi_shader + 894)
-+-#define mc_exit (rpi_shader + 1048)
-+-#define mc_exit1 (rpi_shader + 1066)
-+-#define mc_interrupt_exit (rpi_shader + 1082)
-+-#define mc_interrupt_exit4 (rpi_shader + 1120)
-+-#define mc_interrupt_exit8 (rpi_shader + 1142)
-+-#define mc_setup_uv (rpi_shader + 1172)
-+-#define mc_filter_uv_b (rpi_shader + 1314)
-+-#define mc_end (rpi_shader + 1542)
-++#define mc_filter (rpi_shader + 364)
-++#define mc_filter_b (rpi_shader + 674)
-++#define mc_filter_honly (rpi_shader + 898)
-++#define mc_exit (rpi_shader + 1052)
-++#define mc_exit1 (rpi_shader + 1070)
-++#define mc_interrupt_exit (rpi_shader + 1086)
-++#define mc_interrupt_exit4 (rpi_shader + 1124)
-++#define mc_interrupt_exit8 (rpi_shader + 1146)
-++#define mc_setup_uv (rpi_shader + 1176)
-++#define mc_filter_uv_b (rpi_shader + 1318)
-++#define mc_end (rpi_shader + 1546)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 02fdcb2..4809e1d 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -21,7 +21,7 @@
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+-# ra21                                          64
-++# ra21                                          32
-+ # ra22                                          256
-+ # ra23                                          8
-+ #
-+@@ -97,7 +97,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 64
-++mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-+ 
-+@@ -270,7 +270,7 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r2, r0, ra0
-++nop                  ; mul24 r2, r0, ra0
-+ nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+@@ -301,9 +301,9 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 14          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -315,12 +315,14 @@ add r1, r1, r0          ; mul24 r0, ra10, rb10
-+ add r1, r1, r0          ; mul24 r0, ra9, rb9
-+ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-add.ifnn r1, r1, r0     ; mov -, vw_wait
-++add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:uvloop
-+ asr r1, r1, 14
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-++add r1, r1, ra21
-++brr.anyn -, r:uvloop
-++asr r1, r1, 6          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out for U
-+ 
-+@@ -1161,7 +1163,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 64
-++mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b558abbe8e70ebb5d75988e2cd21976474a2d4eb Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 14:37:32 +0100
-+Subject: [PATCH 19/68] Removed unused luma functions
-+
-+---
-+ libavcodec/hevc.c          |    4 +-
-+ libavcodec/rpi_qpu.c       |   32 +-
-+ libavcodec/rpi_shader.c    | 1097 +++++++++++++-------------------------------
-+ libavcodec/rpi_shader.h    |   19 +-
-+ libavcodec/rpi_shader.qasm |  970 +++------------------------------------
-+ 5 files changed, 396 insertions(+), 1726 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index bcc831e..3967361 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2718,8 +2718,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         return;
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // Also need a dummy for V
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+         assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4d9eda8..4e90cc1 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -172,7 +172,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+ 
-+   // Now copy over the QPU code into GPU memory
-+   {
-+-    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP);
-++    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
-+     assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+   }
-+@@ -612,24 +612,24 @@ unsigned int qpu_get_fn(int num) {
-+       gpu_unlock();
-+     }
-+     switch(num) {
-+-    case QPU_MC_SETUP:
-+-      fn = mc_setup;
-+-      break;
-+-    case QPU_MC_FILTER:
-+-      fn = mc_filter;
-+-      break;
-++    //case QPU_MC_SETUP:
-++    //  fn = mc_setup;
-++    //  break;
-++    //case QPU_MC_FILTER:
-++    //  fn = mc_filter;
-++    //  break;
-+     case QPU_MC_EXIT:
-+       fn = mc_exit;
-+       break;
-+-    case QPU_MC_INTERRUPT_EXIT:
-+-      fn = mc_interrupt_exit;
-+-      break;
-+-    case QPU_MC_FILTER_B:
-+-      fn = mc_filter_b;
-+-      break;
-+-    case QPU_MC_FILTER_HONLY:
-+-      fn = mc_filter_honly;
-+-      break;
-++    //case QPU_MC_INTERRUPT_EXIT:
-++    //  fn = mc_interrupt_exit;
-++    //  break;
-++    //case QPU_MC_FILTER_B:
-++    //  fn = mc_filter_b;
-++    //  break;
-++    //case QPU_MC_FILTER_HONLY:
-++    //  fn = mc_filter_honly;
-++    //  break;
-+     case QPU_MC_SETUP_UV:
-+       fn = mc_setup_uv;
-+       break;
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 831633b..170e8ac 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -21,798 +21,331 @@ __declspec(align(8))
-+ __attribute__((aligned(8)))
-+ #endif
-+ unsigned int rpi_shader[] = {
-+-// ::mc_setup
-++// ::mc_setup_uv
-+ /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+ /* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+ /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+ /* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00000020] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000028] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000030] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000038] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000040] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000048] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000050] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000058] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x00000060] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000068] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000070] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000078] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000d8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000e0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000e8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000000f0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000000f8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000100] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000108] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000110] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000118] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000120] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000128] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000130] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000138] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000140] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000150] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000158] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000160] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000168] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000178] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x00000180] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000188] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x00000190] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x00000198] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000001a0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-++/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-++/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-++/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
-++/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ /* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001b0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x000001b8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x000001c0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+ /* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001d0] */ 0x4c9d00cf, 0x10024821, // add r0, r0, r3; mul24 r1, r1, rb_pitch
-+-/* [0x000001d8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e8] */ 0x949dc5c0, 0xd0025890, // and r2, r2, ~3; mov ra_x_base, r0
-+-/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000200] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000210] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000218] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000220] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000228] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000230] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000238] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000240] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000248] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000250] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000258] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000260] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000268] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000270] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000278] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000280] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000288] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000290] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000298] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002d8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000300] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000370] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000380] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000388] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000390] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000398] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000448] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000450] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000458] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000460] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000468] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000470] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000478] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000480] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000488] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000490] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000498] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004a0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004a8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004b0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004c0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004c8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004d8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004e0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004e8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000004f0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000004f8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000500] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000508] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000510] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000518] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000520] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000528] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000530] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000538] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000540] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000548] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000550] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000558] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000560] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000568] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000570] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000578] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000580] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000588] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000590] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000598] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter
-+-/* [0x000005b0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005b8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005c0] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x000005c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005d0] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f0] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x000005f8] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000600] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000608] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000610] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000618] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000620] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000628] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000630] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000638] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000648] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000650] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000658] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000660] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000668] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000670] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000678] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000680] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000688] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000690] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000698] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000006a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006c0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006d0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000700] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000710] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000718] */ 0x000001d0, 0xf07809e7, // brr.anynn -, r:fast_path
-+-/* [0x00000720] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000728] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000730] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000738] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :loop
-+-/* [0x00000740] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000748] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000750] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000758] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000760] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000768] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000770] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000778] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000780] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000788] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000798] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x000007a0] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000007a8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007b0] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007b8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007c0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007c8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007d0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007d8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007e0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007e8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007f0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007f8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000800] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000808] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000810] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000818] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000820] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000828] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000830] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000838] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000840] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000848] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000850] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000858] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x00000860] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000868] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000870] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000878] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000880] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000888] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000890] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000898] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x000008a0] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x000008a8] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008b0] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008b8] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000008c0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008c8] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:loop
-+-/* [0x000008d0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000008f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000900] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// :fast_path
-+-/* [0x00000908] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :fast_loop
-+-/* [0x00000910] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000918] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000920] */ 0x95727d9b, 0x1004475f, // mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-/* [0x00000928] */ 0x95690dbf, 0x10044623, // mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-/* [0x00000930] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000938] */ 0x929de5e4, 0x100248a1, // min r2, r2, rb_frame_height_minus_1 ; mov r1, r4
-+-/* [0x00000940] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000948] */ 0xec414c87, 0x10024e20, // add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-/* [0x00000950] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000958] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000960] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-/* [0x00000968] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-/* [0x00000970] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-/* [0x00000978] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-/* [0x00000980] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-/* [0x00000988] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-/* [0x00000990] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-/* [0x00000998] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x000009a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000009a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000009b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000009b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000009c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000009c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1, ra22
-+-/* [0x000009d8] */ 0xffffff18, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x000009e0] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x000009e8] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x000009f0] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x000009f8] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000a00] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000a08] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a10] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a18] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a20] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a28] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000a30] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000a38] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000a40] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000a48] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:fast_loop
-+-/* [0x00000a50] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x00000a58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a68] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a70] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a78] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a80] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_b
-+-/* [0x00000a88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000a90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000a98] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000aa0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000aa8] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000ab0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000ab8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000ac0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000ac8] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000ad0] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000ad8] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000ae0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000ae8] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000af0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000af8] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000b00] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000b08] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000b10] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000b18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b20] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000b28] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000b30] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000b38] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000b40] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000b48] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000b50] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000b58] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000b60] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000b68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000b70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000b78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000b80] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00000b88] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000b90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000b98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ba8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000bb0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bb8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bc0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000bc8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000bd0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bd8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000be0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000be8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000bf0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000bf8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000c00] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000c08] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000c10] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000c18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c20] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :bloop
-+-/* [0x00000c28] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000c30] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000c38] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000c40] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000c48] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000c50] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000c58] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000c60] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000c68] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000c70] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000c78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000c80] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000c88] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000c90] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000c98] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000ca0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000ca8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000cb0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000cb8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000cc0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000cc8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000cd0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000cd8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000ce0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ce8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000cf0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000cf8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x00000d00] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000d08] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000d10] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000d18] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000d20] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000d28] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000d30] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000d38] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000d40] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000d48] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000d50] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000d58] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000d60] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000d68] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000d70] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000d78] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000d80] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000d88] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000d90] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000d98] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000da0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x00000da8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000db0] */ 0x8fc8f3f6, 0xd0020867, // asr r1, r1, 15          ; mov -, vr_wait
-+-/* [0x00000db8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000dc0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000dc8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:bloop
-+-/* [0x00000dd0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000dd8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000de0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000de8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000df0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000df8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000e00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_honly
-+-/* [0x00000e08] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000e10] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000e18] */ 0x156e7d80, 0x10020667, // mov ra_x2shift, ra_x2shift_next
-+-/* [0x00000e20] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000e28] */ 0x0c9c81c0, 0xd00208a7, // add r2, r0, 8
-+-/* [0x00000e30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000e38] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3,unif
-+-/* [0x00000e40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000e48] */ 0x139c05c0, 0xd00208a7, // max r2, r2, 0
-+-/* [0x00000e50] */ 0x129d95c0, 0x100208a7, // min r2, r2, rb_frame_width_minus_1
-+-/* [0x00000e58] */ 0x119c35c0, 0xd00206e7, // shl ra_x2shift_next, r2, 3
-+-/* [0x00000e60] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000e68] */ 0x0c9e74c0, 0x100208a7, // add r2, r2, r3
-+-/* [0x00000e70] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000e78] */ 0x149dc5c0, 0xd00206a7, // and ra_x2_base_next, r2, ~3
-+-/* [0x00000e80] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000e88] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e90] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e98] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ea0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000ea8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000eb0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000eb8] */ 0x0c9de1c0, 0xd0021467, // add rb17, r0, -2
-+-/* [0x00000ec0] */ 0x919c71c0, 0xd0024812, // shl r0, r0, 7 ; mov rb18,r0
-+-/* [0x00000ec8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000ed0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000ed8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000ee0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000f08] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f18] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f20] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f30] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000f38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :loop_honly
-+-/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000f50] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f68] */ 0xee654987, 0x10024860, // shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20
-+-/* [0x00000f70] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f78] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f80] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000f88] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000f90] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000f98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fa0] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000fa8] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fb0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fb8] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fc0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000fc8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000fd0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000fd8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000fe0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000fe8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000ff0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000ff8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001000] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001008] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001010] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001018] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001020] */ 0x8d5927f6, 0x100269e1, // sub.setf -, r3, rb18 ; mov r1, ra22
-+-/* [0x00001028] */ 0x559f2fc1, 0x100049e0, // mov -, vw_wait   ; mul24 r0, r0, r1
-+-/* [0x00001030] */ 0xfffffef8, 0xf06809e7, // brr.anyn -, r:loop_honly
-+-/* [0x00001038] */ 0x0f9cf1c0, 0xd0020827, // asr r0, r0, 15
-+-/* [0x00001040] */ 0x129d61c0, 0x10020827, // min r0, r0, rb22
-+-/* [0x00001048] */ 0x139c01c0, 0xd0020c27, // max vpm, r0, 0
-+-/* [0x00001050] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001058] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001060] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001068] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_exit
-+-/* [0x00001070] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001078] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00001080] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001088] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001090] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001098] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x000010b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_exit1
-+-/* [0x000010b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000010c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010c8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010d8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000010e0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000010e8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000010f0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit
-+-/* [0x000010f8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001100] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001108] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001118] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001120] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001128] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001178] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001180] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001188] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit4
-+-/* [0x00001190] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001198] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011b0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000011d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000011d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000011e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit8
-+-/* [0x000011e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000011f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000011f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001200] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001210] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001218] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001220] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001228] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001230] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001238] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001240] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001248] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00001250] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00001258] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_setup_uv
-+-/* [0x00001260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001268] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-+-/* [0x00001270] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00001278] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-+-/* [0x00001280] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00001288] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-+-/* [0x00001290] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00001298] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x000012a0] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x000012a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000012b0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x000012b8] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x000012c0] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x000012c8] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x000012d0] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x000012d8] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x000012e0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x000012e8] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x000012f0] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x000012f8] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00001300] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00001308] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00001310] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00001318] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00001320] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00001328] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00001330] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00001338] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00001340] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00001348] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00001350] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00001358] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00001360] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00001368] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00001370] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00001378] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001380] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00001388] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00001390] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00001398] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000013a0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000013a8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000013b0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000013b8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x000013c0] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x000013c8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x000013d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000013d8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x000013e0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x000013e8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000013f0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000013f8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x00001400] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x00001408] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00001410] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00001418] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001420] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x00001428] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001430] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00001438] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00001440] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00001448] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001450] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001458] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001460] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00001468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00001470] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00001478] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001480] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00001488] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00001490] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00001498] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000014a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000014a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000014b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000014b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000014c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000014c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000014d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000014d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000014e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000014e8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000014f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000014f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00001500] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001508] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001510] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001518] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001520] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001528] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001530] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001538] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00001540] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00001548] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00001550] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001558] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001560] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001568] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-+-/* [0x00001570] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00001578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001580] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001588] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001590] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001598] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000015a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015a8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015b8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000015c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000015d8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000015e0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015e8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015f0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000015f8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00001600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00001608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00001618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00001620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00001628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00001630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00001640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00001658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00001660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00001668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001670] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00001678] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00001680] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00001688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00001690] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00001698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000016a0] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000016a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000016b0] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000016b8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000016c0] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000016c8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000016d0] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000016d8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000016e0] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000016e8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000016f0] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-+-/* [0x000016f8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00001700] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00001708] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001710] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001718] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001720] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001728] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001730] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00001738] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00001740] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00001748] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00001750] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00001758] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00001760] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00001768] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00001770] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00001778] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00001780] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00001788] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00001790] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00001798] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-/* [0x000017a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000017a8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000017b0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000017b8] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000017c0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000017c8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000017d0] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000017d8] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000017e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000017e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000017f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000017f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00001800] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00001808] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001810] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00001818] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001820] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-++/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-++/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-++/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-++/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-++/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-++/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-++/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_exit
-++/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3464cdb..9de4535 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -3,18 +3,11 @@
-+ 
-+ extern unsigned int rpi_shader[];
-+ 
-+-#define mc_setup (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 146)
-+-#define mc_filter (rpi_shader + 364)
-+-#define mc_filter_b (rpi_shader + 674)
-+-#define mc_filter_honly (rpi_shader + 898)
-+-#define mc_exit (rpi_shader + 1052)
-+-#define mc_exit1 (rpi_shader + 1070)
-+-#define mc_interrupt_exit (rpi_shader + 1086)
-+-#define mc_interrupt_exit4 (rpi_shader + 1124)
-+-#define mc_interrupt_exit8 (rpi_shader + 1146)
-+-#define mc_setup_uv (rpi_shader + 1176)
-+-#define mc_filter_uv_b (rpi_shader + 1318)
-+-#define mc_end (rpi_shader + 1546)
-++#define mc_setup_uv (rpi_shader + 0)
-++#define mc_filter_uv (rpi_shader + 142)
-++#define mc_filter_uv_b (rpi_shader + 360)
-++#define mc_exit (rpi_shader + 588)
-++#define mc_interrupt_exit8 (rpi_shader + 606)
-++#define mc_end (rpi_shader + 636)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 4809e1d..cd7346d 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -71,8 +71,10 @@
-+ 
-+ .set rb_const_64,                  rb21
-+ 
-+-# mc_setup(next_kernel, x, y, ref_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1)
-+-::mc_setup
-++
-++################################################################################
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++::mc_setup_uv
-+ 
-+ # Read starting kernel
-+ mov ra31, unif
-+@@ -80,7 +82,9 @@ mov ra31, unif
-+ # Load first request location
-+ add ra_x_base, unif, elem_num # Store x
-+ mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame base
-++mov ra_x2_base, unif # Store frame u base
-++nop
-++sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+ 
-+ # Read image dimensions
-+ sub rb25,unif,1
-+@@ -143,29 +147,24 @@ mov r1, vpm_setup(0, 4, h8p(0, 0))
-+ add rb28, r0, r1
-+ 
-+ # Compute base address for first and second access
-+-#add r0, unif, elem_num     # x
-+ mov r0, ra_x_base           # Load x
-+-add r2, r0, 8               # x+8
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+ min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ add ra_y, r1, 1
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-max r1, r1, 0  # y
-+-min r1, r1, rb_frame_height_minus_1
-+-add r0, r0, r3; mul24 r1, r1, rb_pitch
-+-add r2, r2, r3
-++add r0, r0, r3
-+ and r0, r0, ~3
-+-and r2, r2, ~3; mov ra_x_base, r0
-++max r1, r1, 0 ; mov ra_x_base, r0 # y
-++min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-++add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_x2_base, r2
-+ add t0s, r2, r1
-+ 
-+ # Dump padding words
-+ mov r0, unif
-+ mov r0, unif
-++mov r0, unif
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -176,6 +175,8 @@ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_x_base
-+ add t0s, r1, ra_x2_base
-+ 
-++
-++
-+ ################################################################################
-+ 
-+ # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+@@ -341,453 +342,26 @@ add vw_setup, rb26, r0 # VDW setup 0
-+ mov vw_setup, rb29 # Stride
-+ mov vw_addr, unif # start the VDW
-+ 
-+-################################################################################
-+-
-+-
-+-# mc_filter(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-add r0, r0, r3
-+-add r2, r2, r3
-+-and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+-mov ra_y_next, r1
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-brr.anynn -, r:fast_path
-+-asr rb12, r0, rb23  # delay slot 1
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-
-+-mov r5rep, -8 # delay slot 2
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r2, r2, r3                                                    ; ldtmu0
-+-##
-+-## mov r0, ra22
-+-## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # apply horizontal filter
-+-##
-+-## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+-## min r2, r2, rb22
-+-## max ra13, r2, 0
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21
-+-## sub r2, r2, r3 ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop            ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop            ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop            ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop            ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop            ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop            ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop            ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra14, r0, 0
-+-##
-+-##
-+-##
-+-##
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-## nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-## sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-## nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-## sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-## nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-## sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-## nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-## sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-## nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-## sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-## nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-## sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-## nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra15, r0, 0
-+-
-+-
-+-
-+-
-+-mov r3, 0
-+-
-+-:loop
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-brr.anyn -, r:loop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:loop
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-+-
-+-# DMA out
-+-
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+-
-+-####################################################
-+-
-+-:fast_path
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21         ; mul24 r3, r0, ra0
-+-## sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-## sub r2, r2, r3                                                    ; ldtmu0
-+-##
-+-## mov r0, ra22
-+-## shr r0, r4, ra17     ; mul24 r2, r2, r0                           ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # apply horizontal filter
-+-##
-+-## asr r2, r2, 15    ; mul24 r3, r0, ra0
-+-## min r2, r2, rb22
-+-## max ra13, r2, 0
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21
-+-## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra14, r0, 0
-+-##
-+-##
-+-##
-+-##
-+-## nop                                                                 ; ldtmu0     # loop counter increment
-+-## shr r0, r4, ra17                                                    ; ldtmu0
-+-## shr r1, r4, ra17     ; v8subs r0, r0, rb20
-+-## add t0s, ra16, r5    ; v8subs r1, r1, rb20
-+-## add ra16, ra16, rb16 ; mov t0s, ra16
-+-##
-+-## # generate seven shifted versions
-+-## # interleave with scroll of vertical context
-+-##
-+-## mov r2, rb21   ; mul24    r3, r0, ra0
-+-## sub r2, r2, r3 ; mul24    r3, ra1 << 1, r0 << 1
-+-## sub r2, r2, r3 ; mul24    r3, ra2 << 2, r0 << 2
-+-## sub r2, r2, r3 ; mul24    r3, ra3 << 3, r0 << 3
-+-## sub r2, r2, r3 ; mul24    r3, ra4 << 4, r0 << 4
-+-## sub r2, r2, r3 ; mul24    r3, ra5 << 5, r0 << 5
-+-## sub r2, r2, r3 ; mul24    r3, ra6 << 6, r0 << 6
-+-## sub r2, r2, r3 ; mul24    r3, ra7 << 7, r0 << 7
-+-## sub r0, r2, r3
-+-##
-+-## # apply horizontal filter
-+-##
-+-## nop          ; mul24 r0, r0, ra22         # last bit of context scroll, including clamp to zero
-+-## asr r0, r0, 15
-+-## min r0, r0, rb22
-+-## max ra15, r0, 0
-+-
-+-
-+-mov r3, 0  # This signifies the amount of unrolling
-+-
-+-:fast_loop
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-# Due to pipelining we can only skip second pipeline instructions related to the fetched pixels
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_y, ra_y_next   ; mov rb31, r3
-+-mov.ifz ra_x2_base, ra_x2_base_next   ; mov r3, rb_pitch
-+-
-+-max r2, ra_y, 0
-+-min r2, r2, rb_frame_height_minus_1 ; mov r1, r4  # discard texture read
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r0, r0, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24 r3, ra1 << 1, r0 << 1
-+-sub r2, r2, r3       ; mul24 r3, ra2 << 2, r0 << 2
-+-sub r2, r2, r3       ; mul24 r3, ra3 << 3, r0 << 3
-+-sub r2, r2, r3       ; mul24 r3, ra4 << 4, r0 << 4
-+-sub r2, r2, r3       ; mul24 r3, ra5 << 5, r0 << 5
-+-sub r2, r2, r3       ; mul24 r3, ra6 << 6, r0 << 6
-+-sub r2, r2, r3       ; mul24 r3, ra7 << 7, r0 << 7
-+-sub r0, r2, r3       ; mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8       ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-
-+-brr.anyn -, r:fast_loop
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-brr.anyn -, r:fast_loop
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-max vpm, r1, 0
-+-
-+-# DMA out
-+-
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_b(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter_b
-++::mc_filter_uv_b
-+ mov ra31, unif
-+ 
-+ # per-channel shifts were calculated on the *previous* invocation
-+ 
-+ mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+ 
-+ # get base addresses and per-channel shifts for *next* invocation
-+ add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+ max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-add r2, r2, r3
-+ and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+ mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -801,17 +375,22 @@ and r0, r0, rb22 # Extract height
-+ add rb17, r0, 5
-+ add rb18, r0, 7
-+ shl r0, r0, 7
-++
-+ # r0 is currently height<<7
-+ # For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+ shl r3, r0, 13
-+ shl r3, r3, 8 # Mask off top 8 bits
-+ shr r3, r3, 8
-++
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-++
-+ # In a B frame, so also set up VPM read
-+ add vr_setup, r3, rb28
-+ 
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -837,9 +416,13 @@ asr rb12, r0, rb23
-+ 
-+ mov r5rep, -8
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-+ mov r3, 0
-+ 
-+-:bloop
-++:uvloop_b
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+ 
-+@@ -847,7 +430,7 @@ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+ shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+ mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+@@ -861,6 +444,7 @@ add t0s, ra_x2_base, r2
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ mov r2, rb21         ; mul24 r3, r0, ra0
-++nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+ sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+ sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+@@ -889,7 +473,7 @@ mov ra13, ra14
-+ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+-brr.anyn -, r:bloop
-++brr.anyn -, r:uvloop_b
-+ max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+ asr r0, r0, 15          ; mov r1, ra21
-+ min.setf ra15, r0, rb22
-+@@ -906,213 +490,50 @@ sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+ sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+ sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15          ; mov -, vr_wait
-++asr r1, r1, 15
-+ min r1, r1, rb22
-+ add r0, vpm, 1          # Blend in previous VPM contents at this location
-+-brr.anyn -, r:bloop
-++brr.anyn -, r:uvloop_b
-+ max r1, r1, 0
-+ add r1, r1, r0
-+ shr vpm, r1, 1
-+ 
-+-# DMA out
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-+ 
-+ bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-+ mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_honly(next_kernel, x, y, frame_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_dst)
-+-# This filter only does horizontal filtering.
-+-# It is assumed that the region to fetch does not include extra rows above.
-++# mc_exit()
-+ 
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-+-::mc_filter_honly
-+-mov ra31, unif
-++::mc_exit
-++mov  -, vw_wait # wait on the VDW
-+ 
-+-# per-channel shifts were calculated on the *previous* invocation
-++mov -,srel(0)
-+ 
-+-mov ra_xshift, ra_xshift_next
-+-mov ra_x2shift, ra_x2shift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-add r2, r0, 8 # x+8
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3,unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-max r2, r2, 0
-+-min r2, r2, rb_frame_width_minus_1
-+-shl ra_x2shift_next, r2, 3
-+-add r0, r0, r3
-+-add r2, r2, r3
-+-and rb_x_base_next, r0, ~3
-+-and ra_x2_base_next, r2, ~3
-+-mov ra_y_next, r1
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, -2 # Pipelining means we move data across 2 iterations early
-+-shl r0, r0, 7 ; mov rb18,r0
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov r0, unif
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # delay slot 3
-+-mov r3, 0
-+-
-+-:loop_honly
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_x2shift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3       ; mov r3, rb31
-+-
-+-sub.setf -, r3, rb18 ; mov r1, ra22
-+-
-+-mov -, vw_wait   ; mul24 r0, r0, r1
-+-brr.anyn -, r:loop_honly
-+-asr r0, r0, 15          # delay 1
-+-min r0, r0, rb22        # delay 2
-+-max vpm, r0, 0          # delay 3
-+-
-+-# DMA out
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW: height rows, 16 8-bit units long
-+-mov vw_setup, rb29
-+-mov vw_addr, unif # start the VDW
-+-
-+-
-+-################################################################################
-+-
-+-# mc_exit()
-+-
-+-::mc_exit
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-mov -,srel(0)
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-+ 
-+ nop        ; nop ; thrend
-+ nop        ; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-+-::mc_exit1
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-#mov -,srel(1)
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+-# mc_interrupt_exit()
-+-::mc_interrupt_exit
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-mov -,sacq(0) # 4
-+-mov -,sacq(0) # 5
-+-mov -,sacq(0) # 6
-+-mov -,sacq(0) # 7
-+-mov -,sacq(0) # 8
-+-mov -,sacq(0) # 9
-+-mov -,sacq(0) # 10
-+-mov -,sacq(0) # 11
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+-# mc_interrupt_exit4()
-+-::mc_interrupt_exit4
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-+-
-+ # mc_interrupt_exit8()
-+ ::mc_interrupt_exit8
-+ mov  -, vw_wait # wait on the VDW
-+@@ -1134,282 +555,5 @@ nop        ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-+-################################################################################
-+-# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+-::mc_setup_uv
-+-
-+-# Read starting kernel
-+-mov ra31, unif
-+-
-+-# Load first request location
-+-add ra_x_base, unif, elem_num # Store x
-+-mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame u base
-+-nop
-+-sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-+-
-+-# Read image dimensions
-+-sub rb25,unif,1
-+-sub rb30,unif,1
-+-
-+-# get source pitch
-+-mov rb16, unif
-+-
-+-# get destination pitch
-+-mov r0, unif
-+-mov r1, vdw_setup_1(0)
-+-add rb24, r1, r0
-+-
-+-# load constants
-+-
-+-mov ra20, 1
-+-mov ra21, 32
-+-mov ra22, 256
-+-mov ra23, 8
-+-
-+-mov rb20, 0xffffff00
-+-mov rb21, 64
-+-mov rb22, 255
-+-mov rb23, 24
-+-
-+-# touch vertical context to keep simulator happy
-+-
-+-mov ra8, 0
-+-mov ra9, 0
-+-mov ra10, 0
-+-mov ra11, 0
-+-mov ra12, 0
-+-mov ra13, 0
-+-mov ra14, 0
-+-mov ra15, 0
-+-
-+-# Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-+-
-+-# Compute part of VPM to save data into
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-add rb28, r0, r1
-+-
-+-# Compute base address for first and second access
-+-mov r0, ra_x_base           # Load x
-+-max r0, r0, 0; mov r1, ra_y # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-+-shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-add ra_y, r1, 1
-+-add r0, r0, r3
-+-and r0, r0, ~3
-+-max r1, r1, 0 ; mov ra_x_base, r0 # y
-+-min r1, r1, rb_frame_height_minus_1
-+-# submit texture requests for first line
-+-add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-add t0s, r0, r1 ; mov ra_x2_base, r2
-+-add t0s, r2, r1
-+-
-+-# Dump padding words
-+-mov r0, unif
-+-mov r0, unif
-+-mov r0, unif
-+-
-+-# submit texture requests for second line
-+-max r1, ra_y, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1
-+-bra -, ra31
-+-nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_x_base
-+-add t0s, r1, ra_x2_base
-+-
-+-
-+-
-+-################################################################################
-+-
-+-::mc_filter_uv_b
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num    # x
-+-max r0, r0, 0; mov r1, unif # y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+-shl ra_xshift_next, r0, 3
-+-sub r2, unif, r3 # compute offset from frame base u to frame base v
-+-add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-+-mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-
-+-# r0 is currently height<<7
-+-# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+-shl r3, r0, 13
-+-shl r3, r3, 8 # Mask off top 8 bits
-+-shr r3, r3, 8
-+-
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# In a B frame, so also set up VPM read
-+-add vr_setup, r3, rb28
-+-
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+-# get filter coefficients
-+-
-+-mov r0, unif
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+-
-+-# r2 is elem_num
-+-# r3 is loop counter
-+-
-+-mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-mov r3, 0
-+-
-+-:uvloop_b
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-+-
-+-# generate seven shifted versions
-+-# interleave with scroll of vertical context
-+-
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-+-brr.anyn -, r:uvloop_b
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15
-+-min r1, r1, rb22
-+-add r0, vpm, 1          # Blend in previous VPM contents at this location
-+-brr.anyn -, r:uvloop_b
-+-max r1, r1, 0
-+-add r1, r1, r0
-+-shr vpm, r1, 1
-+-
-+-
-+-# DMA out for U
-+-
-+-mov vw_setup, rb26 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-+-
-+-# DMA out for V
-+-# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+-# Could potentially push this write into the start of the next pipeline stage.
-+-mov r0, 16
-+-mov -, vw_wait
-+-
-+-bra -, ra31
-+-add vw_setup, rb26, r0 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-+-
-+ ::mc_end
-++# Do not add code here because mc_end must appear after all other code.
-+-- 
-+2.5.0
-+
-+
-+From f91f17a1cce2b0f6996569ee7cccf0c9768afd87 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 14:54:25 +0100
-+Subject: [PATCH 20/68] Moved chroma P1 to QPUs
-+
-+---
-+ libavcodec/hevc.c | 38 ++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 38 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 3967361..4dad0e0 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2057,6 +2057,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int reflist = 1;
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[reflist];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++                int chan = x0>>8;
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
-+                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
-+                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
-+-- 
-+2.5.0
-+
-+
-+From 36aba6ea897093f6528658e78bf4deeba7eeecd2 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 13 May 2015 15:13:47 +0100
-+Subject: [PATCH 21/68] Added B prediction - not quite right
-+
-+---
-+ libavcodec/hevc.c          |  58 ++++++++++++++++++++++++
-+ libavcodec/rpi_shader.c    | 108 +++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   6 +--
-+ libavcodec/rpi_shader.qasm |  48 ++++++++++----------
-+ 4 files changed, 141 insertions(+), 79 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 4dad0e0..eee617d 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2114,6 +2114,64 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-++#ifdef RPI_INTER_QPU
-++            if (s->enable_rpi) {
-++                int hshift           = s->ps.sps->hshift[1];
-++                int vshift           = s->ps.sps->vshift[1];
-++                const Mv *mv         = &current_mv.mv[0];
-++                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
-++                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
-++                intptr_t _mx         = mx << (1 - hshift);
-++                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
-++                int x1_c = x0_c + (mv->x >> (2 + hshift));
-++                int y1_c = y0_c + (mv->y >> (2 + hshift));
-++
-++                const Mv *mv2         = &current_mv.mv[1];
-++                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
-++                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
-++                intptr_t _mx2         = mx2 << (1 - hshift);
-++                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
-++
-++                int x2_c = x0_c + (mv2->x >> (2 + hshift));
-++                int y2_c = y0_c + (mv2->y >> (2 + hshift));
-++
-++                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-++
-++                uint32_t *u = s->u_mvs[chan & 7];
-++                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-++                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = rpi_filter_coefs[_mx][0];
-++                      *u++ = rpi_filter_coefs[_mx][1];
-++                      *u++ = rpi_filter_coefs[_my][0];
-++                      *u++ = rpi_filter_coefs[_my][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-++                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-++                      *u++ = rpi_filter_coefs[_mx2][0];
-++                      *u++ = rpi_filter_coefs[_mx2][1];
-++                      *u++ = rpi_filter_coefs[_my2][0];
-++                      *u++ = rpi_filter_coefs[_my2][1];
-++                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-++                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                    }
-++                }
-++                s->u_mvs[chan & 7] = u;
-++                return;
-++            }
-++#endif
-+             RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
-+                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 170e8ac..5d00cb2 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -265,23 +265,23 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+ /* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+ /* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000778] */ 0x55015fc6, 0x100248a3, // mov r2, rb21         ; mul24 r3, r0, ra0
-+-/* [0x00000780] */ 0x40038031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-/* [0x00000788] */ 0x4d07f4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ /* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000798] */ 0x4d0be4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ /* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007a8] */ 0x4d0fd4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007b8] */ 0x4d13c4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ /* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007c8] */ 0x4d17b4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ /* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007d8] */ 0x4d1ba4f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ /* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007e8] */ 0x4d1f94f0, 0xd00248a3, // sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ /* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000007f8] */ 0x0d9e74c0, 0x10020827, // sub r0, r2, r3
-++/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+ /* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+ /* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+ /* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+@@ -291,61 +291,63 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+ /* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000848] */ 0x533c0dc1, 0xd00243a0, // max ra14, ra15, 0       ; mul24 r0, r0, r1
-+-/* [0x00000850] */ 0x8f54f1f6, 0xd0024821, // asr r0, r0, 15          ; mov r1, ra21
-+-/* [0x00000858] */ 0x129d61c0, 0x100223e7, // min.setf ra15, r0, rb22
-+-/* [0x00000860] */ 0x4038e037, 0x100049e0, // nop                     ; mul24 r0, ra14, rb14
-+-/* [0x00000868] */ 0x4d34d237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-/* [0x00000870] */ 0x4d30c237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000878] */ 0x4d2cb237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000880] */ 0x4d28a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000888] */ 0x4d249237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000890] */ 0x4d208237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000898] */ 0x4d3cf237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a0] */ 0x8d9f223f, 0x100a0867, // sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+ /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b0] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-+-/* [0x000008b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008c0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000008c8] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000008d0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000008d8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000008e0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000008e8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000900] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000908] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000910] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000918] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000920] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000928] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000930] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000940] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000948] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+ /* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000960] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000968] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000970] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000980] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000988] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000009e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000009e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 9de4535..e36c4ae 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,8 +6,8 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 142)
-+ #define mc_filter_uv_b (rpi_shader + 360)
-+-#define mc_exit (rpi_shader + 588)
-+-#define mc_interrupt_exit8 (rpi_shader + 606)
-+-#define mc_end (rpi_shader + 636)
-++#define mc_exit (rpi_shader + 592)
-++#define mc_interrupt_exit8 (rpi_shader + 610)
-++#define mc_end (rpi_shader + 640)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index cd7346d..870437d2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -443,23 +443,23 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+-mov r2, rb21         ; mul24 r3, r0, ra0
-+-nop                  ; mul24.ifnz r3, ra0 << 8, r1 << 8
-+-sub r2, r2, r3       ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+ nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-sub r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-sub r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-sub r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+ nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-sub r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+ nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-sub r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+ nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-sub r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+ nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-sub r0, r2, r3
-++add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+@@ -474,23 +474,25 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+-max ra14, ra15, 0       ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr r0, r0, 15          ; mov r1, ra21
-+-min.setf ra15, r0, rb22
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r0, ra14, rb14
-+-sub r1, r1, r0          ; mul24 r0, ra13, rb13
-+-sub r1, r1, r0          ; mul24 r0, ra12, rb12
-+-sub r1, r1, r0          ; mul24 r0, ra11, rb11
-+-sub r1, r1, r0          ; mul24 r0, ra10, rb10
-+-sub r1, r1, r0          ; mul24 r0, ra9, rb9
-+-sub r1, r1, r0          ; mul24 r0, ra8, rb8
-+-sub r1, r1, r0          ; mul24 r0, ra15, rb15
-+-sub.ifnn r1, r1, r0     ; mov -, vw_wait
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 15
-++asr r1, r1, 14
-++add r1, r1, ra21
-++asr r1, r1, 6
-+ min r1, r1, rb22
-+ add r0, vpm, 1          # Blend in previous VPM contents at this location
-+ brr.anyn -, r:uvloop_b
-+-- 
-+2.5.0
-+
-+
-+From 5dec9ecc623e90c6e14b72a34a6bffdd2a005edb Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 08:15:55 +0100
-+Subject: [PATCH 22/68] Added flush for SAO
-+
-+---
-+ libavcodec/hevc.c        |  2 +-
-+ libavcodec/hevc_filter.c | 39 ++++++++++++++++++++++++++-------------
-+ 2 files changed, 27 insertions(+), 14 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index eee617d..58da57d 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2890,7 +2890,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             rpi_execute_inter_qpu(s);
-+ #endif
-+             // Transform all blocks
-+-            //printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 9b6e26d..92a8271 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -871,6 +871,21 @@ static void flush_buffer(AVBufferRef *bref) {
-+     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+     gpu_cache_flush(p);
-+ }
-++
-++static void ff_hevc_flush_chroma(HEVCContext *s)
-++{
-++    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-++            s->nal_unit_type == NAL_TSA_N   ||
-++            s->nal_unit_type == NAL_STSA_N  ||
-++            s->nal_unit_type == NAL_RADL_N  ||
-++            s->nal_unit_type == NAL_RASL_N )) {
-++        flush_buffer(s->frame->buf[1]);
-++        flush_buffer(s->frame->buf[2]);
-++        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-++        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-++        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-++    }
-++}
-+ #endif
-+ 
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+@@ -886,31 +901,29 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x - ctb_size, y);
-+         if (y && x_end) {
-+             sao_filter_CTB(s, x, y - ctb_size);
-+-            if (s->threads_type & FF_THREAD_FRAME )
-++            if (s->threads_type & FF_THREAD_FRAME ) {
-++#ifdef RPI_INTER_QPU
-++                ff_hevc_flush_chroma(s);
-++#endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-++            }
-+         }
-+         if (x_end && y_end) {
-+             sao_filter_CTB(s, x , y);
-+-            if (s->threads_type & FF_THREAD_FRAME )
-++            if (s->threads_type & FF_THREAD_FRAME ) {
-++#ifdef RPI_INTER_QPU
-++                ff_hevc_flush_chroma(s);
-++#endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-++            }
-+         }
-+     } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
-+         //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+-        if (!(  s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )) {
-+ #ifdef RPI_INTER_QPU
-+-            flush_buffer(s->frame->buf[1]);
-+-            flush_buffer(s->frame->buf[2]);
-++        ff_hevc_flush_chroma(s);
-+ #endif
-+-            //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+-            //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+-            //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+-        }
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 015b41d06a02e23c7937f6c91c4270b2bc2e48c9 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 09:17:28 +0100
-+Subject: [PATCH 23/68] Stopped using acceleration in unsupported cases
-+
-+---
-+ libavcodec/hevc.c       | 14 +++++++-------
-+ libavcodec/hevc_cabac.c |  4 ++--
-+ 2 files changed, 9 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 58da57d..c59ee63 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -1139,15 +1139,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-+-                        printf("Cross component not supported\n"); // TODO
-+-                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+ 
-+             if (lc->tu.cross_pf) {
-+-                printf("Cross component not supported\n"); // TODO
-+-                exit(-1);
-+                 hls_cross_component_pred(s, 1);
-+             }
-+             for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
-+@@ -1176,8 +1172,6 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
-+                         for (i = 0; i < (size * size); i++) {
-+                             coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+                         }
-+-                        printf("Cross component not supported\n"); // TODO
-+-                        exit(-1);
-+                         s->hevcdsp.transform_add[log2_trafo_size_c-2](dst, coeffs, stride);
-+                     }
-+             }
-+@@ -2844,7 +2838,13 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-+-    s->enable_rpi = 1; // TODO this should depend on cross component and frame width etc.
-++    s->enable_rpi = s->ps.sps->bit_depth == 8
-++                    && s->ps.sps->width <= RPI_MAX_WIDTH
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag
-++                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-++                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-++                    && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-++
-+ #endif
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index f28759b..ca76cb0 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1513,9 +1513,9 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ #ifdef RPI
-+             if (!use_vpu) {
-+               int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+-              if (max_xy == 0)
-++              if (max_xy == 0) {
-+                   s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
-+-              else {
-++              } else {
-+                   int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-+                   if (max_xy < 4)
-+                       col_limit = FFMIN(4, col_limit);
-+-- 
-+2.5.0
-+
-+
-+From 3b96ec07ff377691a80df9b15de202fcff660599 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 09:42:16 +0100
-+Subject: [PATCH 24/68] Split B prediction into two passes
-+
-+---
-+ libavcodec/hevc.c          |   1 +
-+ libavcodec/hevc.h          |   1 +
-+ libavcodec/rpi_qpu.c       |   3 +
-+ libavcodec/rpi_qpu.h       |   1 +
-+ libavcodec/rpi_shader.c    | 559 +++++++++++++++++++++++++++------------------
-+ libavcodec/rpi_shader.h    |  11 +-
-+ libavcodec/rpi_shader.qasm | 196 ++++++++++++++--
-+ 7 files changed, 531 insertions(+), 241 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index c59ee63..7e82602 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3729,6 +3729,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+             p += uv_commands_per_qpu;
-+         }
-+         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-++        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+         s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
-+ 
-+     }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index cae6659..3511982 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -943,6 +943,7 @@ typedef struct HEVCContext {
-+     uint32_t *u_mvs[8];
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-++    uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4e90cc1..60bf079 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -636,6 +636,9 @@ unsigned int qpu_get_fn(int num) {
-+     case QPU_MC_FILTER_UV:
-+       fn = mc_filter_uv;
-+       break;
-++    case QPU_MC_FILTER_UV_B0:
-++      fn = mc_filter_uv_b0;
-++      break;
-+     case QPU_MC_FILTER_UV_B:
-+       fn = mc_filter_uv_b;
-+       break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index f9ad333..543c84b 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -29,6 +29,7 @@ enum {
-+   QPU_MC_FILTER_HONLY,
-+   QPU_MC_SETUP_UV,
-+   QPU_MC_FILTER_UV,
-++  QPU_MC_FILTER_UV_B0,
-+   QPU_MC_FILTER_UV_B,
-+   QPU_MC_INTERRUPT_EXIT8,
-+   QPU_MC_END
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 5d00cb2..88ad20b 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -39,18 +39,18 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+ /* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000088] */ 0x00000040, 0xe0021567, // mov rb21, 64
-+-/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+ /* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+ /* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+ /* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+@@ -62,176 +62,176 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+ /* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+ /* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000188] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x00000190] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x00000198] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001a0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001a8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001b0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001c0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001d0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001d8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x000001e0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x000001e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000001f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000001f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000200] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000208] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000210] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000218] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000220] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000228] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000230] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000238] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000240] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000248] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000250] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000258] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000260] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000268] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000270] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000278] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000280] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000288] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000290] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000298] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002c0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002c8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000002f0] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000360] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000380] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000388] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000430] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000438] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000440] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000448] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000450] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000458] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000460] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000468] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000470] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000478] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000480] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000488] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000490] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000498] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004a0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004a8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004b0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004b8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004c0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004c8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004d0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004d8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000004e0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000004e8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x000004f0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x000004f8] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000500] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000508] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000510] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000518] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000520] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000528] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000530] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000538] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000540] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000548] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000550] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000558] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000560] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000568] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000570] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000578] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000580] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000588] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000590] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000598] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-// ::mc_filter_uv_b
-+-/* [0x000005a0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005a8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005b0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005b8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005c0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005c8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005d0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000005d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000005e0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000005e8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000005f0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000005f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000600] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000608] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000610] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000618] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000620] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000628] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000630] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000638] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000640] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000648] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000650] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000658] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000660] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000668] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000670] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_uv_b0
-++/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+ /* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+ /* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+@@ -253,7 +253,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+ /* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ /* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+-// :uvloop_b
-++// :uvloop_b0
-+ /* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+ /* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+ /* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+@@ -290,7 +290,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+ /* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+ /* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+ /* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+ /* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+ /* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+@@ -306,48 +306,163 @@ unsigned int rpi_shader[] = {
-+ /* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ /* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+ /* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000008c0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000008c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d0] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x000008d8] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000008e0] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x000008e8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x000008f0] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x000008f8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000900] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000908] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000910] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000920] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000928] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000930] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000938] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_uv_b
-++/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :uvloop_b
-++/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-++/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-++/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000948] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000950] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000960] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000970] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000978] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000980] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000990] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000998] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000009e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000009f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000009f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index e36c4ae..809e582 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,10 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 142)
-+-#define mc_filter_uv_b (rpi_shader + 360)
-+-#define mc_exit (rpi_shader + 592)
-+-#define mc_interrupt_exit8 (rpi_shader + 610)
-+-#define mc_end (rpi_shader + 640)
-++#define mc_filter_uv (rpi_shader + 150)
-++#define mc_filter_uv_b0 (rpi_shader + 368)
-++#define mc_filter_uv_b (rpi_shader + 586)
-++#define mc_exit (rpi_shader + 818)
-++#define mc_interrupt_exit8 (rpi_shader + 836)
-++#define mc_end (rpi_shader + 866)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 870437d2..635b894 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -26,7 +26,7 @@
-+ # ra23                                          8
-+ #
-+ # rb20                                          0xffffff00
-+-# rb21                                          64
-++# rb21                                          vpm_setup for writing 16bit results into VPM
-+ # rb22                                          255
-+ # rb23                                          24
-+ #
-+@@ -34,7 +34,7 @@
-+ # rb25                                          frame width-1
-+ # rb26                                          height<<23 + width<<16 + vdw_setup_0
-+ # rb27                                          vdw_setup_0 (depends on QPU number)
-+-# rb28                                          vpm_setup (depends on QPU number)
-++# rb28                                          vpm_setup (depends on QPU number) for writing 8bit results into VPM
-+ # rb29                                          vdw_setup_1(dst_pitch-width)
-+ # rb30                                          frame height-1
-+ # rb31                                          used as temp to count loop iterations
-+@@ -69,8 +69,6 @@
-+ .set ra_y_next,                    ra28
-+ .set ra_y,                         ra29
-+ 
-+-.set rb_const_64,                  rb21
-+-
-+ 
-+ ################################################################################
-+ # mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-+@@ -106,7 +104,6 @@ mov ra22, 256
-+ mov ra23, 8
-+ 
-+ mov rb20, 0xffffff00
-+-mov rb21, 64
-+ mov rb22, 255
-+ mov rb23, 24
-+ 
-+@@ -123,6 +120,7 @@ mov ra15, 0
-+ 
-+ # Compute part of VPM to use for DMA output
-+ mov r2, qpu_num
-++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+ and r2, r2, 15
-+ mov r1, r2
-+ asr r1, r1, 2
-+@@ -135,16 +133,21 @@ shl r0, r0, 5
-+ add rb27, r0, r1
-+ 
-+ # Compute part of VPM to save data into
-+-mov r2, qpu_num
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))
-++mov r2, qpu_num   # qpu_num = abcd
-++shl r2, r2, 1
-++and r2, r2, 15    # r2 = bcd0
-++mov r1, r2        # r1 = bcd0
-++asr r1, r1, 2     # r1 = bc
-++shl r1, r1, 6     # r1 = bc000000
-++mov r0, r2        # r0 = bcd0
-++and r0, r0, 3     # r0 = d0
-++add r0, r0, r1    # r0 = bc0000d0
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add rb28, r0, r1
-++asr r0, r0, 1     # r0 = bc0000d
-++# Prepare VPM command for 16bit intermediates
-++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-++add rb21, r0, r1
-+ 
-+ # Compute base address for first and second access
-+ mov r0, ra_x_base           # Load x
-+@@ -345,6 +348,171 @@ mov vw_addr, unif # start the VDW
-+ 
-+ ################################################################################
-+ 
-++# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x_base, ra_x16_base point to the current coordinates for this block
-++::mc_filter_uv_b0
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num    # x
-++max r0, r0, 0; mov r1, unif # y
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-++shl ra_xshift_next, r0, 3
-++sub r2, unif, r3 # compute offset from frame base u to frame base v
-++add r0, r0, r3
-++and rb_x_base_next, r0, ~3
-++mov ra_y_next, r1
-++add ra_x2_base_next, rb_x_base_next, r2
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-++
-++# get filter coefficients
-++
-++mov r0, unif
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++asr rb12, r0, rb23
-++
-++# r2 is elem_num
-++# r3 is loop counter
-++
-++mov r5rep, -8
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:uvloop_b0
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_x2_base, r2
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3
-++
-++mov r3, rb31
-++
-++mov ra8, ra9
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++mov ra13, ra14
-++
-++sub.setf -, r3, 8 ; mov r1, ra22
-++
-++# apply horizontal filter
-++brr.anyn -, r:uvloop_b0
-++mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-++asr ra15, r0, 8         ; nop
-++nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb14
-++nop                     ; mul24 r0, ra13, rb13
-++add r1, r1, r0          ; mul24 r0, ra12, rb12
-++add r1, r1, r0          ; mul24 r0, ra11, rb11
-++add r1, r1, r0          ; mul24 r0, ra10, rb10
-++add r1, r1, r0          ; mul24 r0, ra9, rb9
-++add r1, r1, r0          ; mul24 r0, ra8, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb15
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++add r1, r1, ra21
-++brr.anyn -, r:uvloop
-++asr r1, r1, 6          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out for U
-++
-++mov vw_setup, rb26 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++# DMA out for V
-++# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-++# Could potentially push this write into the start of the next pipeline stage.
-++mov r0, 16
-++mov -, vw_wait
-++
-++bra -, ra31
-++add vw_setup, rb26, r0 # VDW setup 0
-++mov vw_setup, rb29 # Stride
-++mov vw_addr, unif # start the VDW
-++
-++################################################################################
-++
-+ ::mc_filter_uv_b
-+ mov ra31, unif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 6ddd4f127ca17be70a2e60a7b2ff127de89b559c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:04:55 +0100
-+Subject: [PATCH 25/68] Switch to using 16bit temp buffers
-+
-+---
-+ libavcodec/hevc.c          |  2 +-
-+ libavcodec/rpi_shader.c    |  4 ++--
-+ libavcodec/rpi_shader.qasm | 10 +++++-----
-+ 3 files changed, 8 insertions(+), 8 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7e82602..753f85c 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2134,7 +2134,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 88ad20b..ffd3a07 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -220,7 +220,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+ /* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+ /* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000618] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+ /* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+ /* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+@@ -346,7 +346,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f8] */ 0x0c9dc7c0, 0x10020c67, // add vr_setup, r3, rb28
-++/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+ /* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+ /* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 635b894..9577121 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -26,7 +26,7 @@
-+ # ra23                                          8
-+ #
-+ # rb20                                          0xffffff00
-+-# rb21                                          vpm_setup for writing 16bit results into VPM
-++# rb21                                          vpm_setup for reading/writing 16bit results into VPM
-+ # rb22                                          255
-+ # rb23                                          24
-+ #
-+@@ -370,8 +370,8 @@ and rb_x_base_next, r0, ~3
-+ mov ra_y_next, r1
-+ add ra_x2_base_next, rb_x_base_next, r2
-+ 
-+-# set up VPM write
-+-mov vw_setup, rb28
-++# set up VPM write, we need to save 16bit precision
-++mov vw_setup, rb21
-+ 
-+ # get width,height of block
-+ mov r2, 16
-+@@ -554,8 +554,8 @@ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-# In a B frame, so also set up VPM read
-+-add vr_setup, r3, rb28
-++# In a B frame, so also set up VPM read (reading back 16bit precision)
-++add vr_setup, r3, rb21
-+ 
-+ sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b516e30ff4a9354497d3b6ecee77bfaeb69ca4d6 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:30:44 +0100
-+Subject: [PATCH 26/68] Corrected B prediction: matching md5 sum for hobbit50
-+
-+---
-+ libavcodec/rpi_shader.c    | 815 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  36 +-
-+ 3 files changed, 429 insertions(+), 434 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index ffd3a07..77cca46 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -38,431 +38,428 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+ /* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+ /* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000e0] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x000000e8] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000f0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000f8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000108] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000110] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000118] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000140] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000148] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000150] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000158] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000160] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000168] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000170] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000178] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000180] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000188] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000190] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000198] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x000001a0] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001a8] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000001b0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001b8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001c0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001c8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001d8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001e8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001f0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001f8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00000200] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000208] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-++/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-++/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-++/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000220] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000228] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000230] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000238] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000240] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000248] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000250] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000258] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000260] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000268] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000270] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000278] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000280] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000288] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000290] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000298] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002e8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000002f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000300] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000308] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000310] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000360] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000380] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000388] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000390] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000398] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000003a0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003a8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000450] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000458] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000460] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000468] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000470] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000478] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000480] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000488] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000490] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000498] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000004a0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000004a8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004b0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004b8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004d0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004d8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000004f8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000500] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000508] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000510] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000518] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000520] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000528] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000530] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000538] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000540] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000548] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000550] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000558] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000560] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000568] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000570] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000578] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000580] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000588] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000590] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000598] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000005a0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000005a8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000005c0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005c8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005d0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005d8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005e8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000005f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000600] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000608] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000610] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000618] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000620] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000628] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000630] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000638] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000640] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000648] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000650] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000658] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000660] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000668] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000670] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000678] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000680] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000688] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006a8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006c8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006e8] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000708] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000710] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000718] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000720] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000728] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000730] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000738] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000740] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000748] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000750] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000758] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000760] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000768] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000770] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000778] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000780] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000788] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000790] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000798] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007a8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007b8] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c0] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007c8] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d0] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007d8] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e0] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007e8] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f0] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x000007f8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000800] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000808] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000810] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000818] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000820] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000828] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000830] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000838] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000840] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000848] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000850] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000858] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000860] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000868] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000870] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000878] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000880] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000888] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000890] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000898] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000008a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000008b8] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000008c0] */ 0xfffffad8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000008c8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000008d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000008d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000008e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000008f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000900] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000908] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000910] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000918] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000920] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000928] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000930] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000938] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000940] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000948] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000950] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000958] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000960] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000968] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000970] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000978] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000980] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000988] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000998] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000009a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000009a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000009b0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000009b8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000009c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000009c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000009d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000009d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000009e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000009e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000009f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000a00] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000a08] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000a10] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a18] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a20] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a28] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000a30] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a38] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a40] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a48] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000a50] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a58] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a60] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a68] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000a70] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a78] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a80] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a88] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000a90] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000a98] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000aa0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000aa8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000ab0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000ab8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000ac0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000ac8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000ad0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000ad8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000ae0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000ae8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000af0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000af8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b00] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000b08] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000b10] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000b18] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000b20] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000b28] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000b30] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000b38] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000b40] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000b48] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000b50] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000b58] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000b60] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000b68] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000b70] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000b78] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000b80] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000b88] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000b90] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000b98] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000ba0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ba8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000bb0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000bb8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000bc0] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000bc8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000bd0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000bd8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000be0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000be8] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000bf0] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000bf8] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000c00] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000c08] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000c10] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000c18] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000c20] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000c28] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000c30] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000c38] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000c40] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000c48] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000c50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000c58] */ 0x0cc01dc0, 0xd0020827, // add r0, vpm, 1
-+-/* [0x00000c60] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000c68] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000c70] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00000c78] */ 0x0e9c13c0, 0xd0020c27, // shr vpm, r1, 1
-+-/* [0x00000c80] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000c88] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000c90] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000c98] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000ca0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000ca8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000cb0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000cb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000cc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000cc8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000cd0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cf0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cf8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d00] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000d08] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000d10] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d70] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d78] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000d80] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 809e582..6562fa9 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 150)
-+-#define mc_filter_uv_b0 (rpi_shader + 368)
-+-#define mc_filter_uv_b (rpi_shader + 586)
-+-#define mc_exit (rpi_shader + 818)
-+-#define mc_interrupt_exit8 (rpi_shader + 836)
-+-#define mc_end (rpi_shader + 866)
-++#define mc_filter_uv (rpi_shader + 152)
-++#define mc_filter_uv_b0 (rpi_shader + 370)
-++#define mc_filter_uv_b (rpi_shader + 584)
-++#define mc_exit (rpi_shader + 812)
-++#define mc_interrupt_exit8 (rpi_shader + 830)
-++#define mc_end (rpi_shader + 860)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 9577121..562dc35 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -39,13 +39,13 @@
-+ # rb30                                          frame height-1
-+ # rb31                                          used as temp to count loop iterations
-+ #
-+-# ra24...ra30                                   15, 14, 13, 12, 11, 10, 9
-+ # ra24                                          clipped(row start address+8+elem_num)&~3
-+ # ra25                                          per-channel shifts 2
-+ # ra26                                          next ra24
-+ # ra27                                          next ra25
-+ # ra28                                          next y
-+ # ra29                                          y for next texture access
-++# ra30                                          64
-+ #
-+ # ra31                                          next kernel address
-+ 
-+@@ -102,6 +102,7 @@ mov ra20, 1
-+ mov ra21, 32
-+ mov ra22, 256
-+ mov ra23, 8
-++mov ra30, 64
-+ 
-+ mov rb20, 0xffffff00
-+ mov rb22, 255
-+@@ -472,7 +473,7 @@ sub.setf -, r3, 8 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b0
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop
-++asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
-+ nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-+ 
-+ # apply vertical filter and write to VPM
-+@@ -487,18 +488,18 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-add r1, r1, ra21
-+-brr.anyn -, r:uvloop
-+-asr r1, r1, 6          # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++#asr r1, r1, 14
-++#add r1, r1, ra21
-++brr.anyn -, r:uvloop_b0
-++asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-++nop                    # Delay 2
-++nop                    # Delay 3
-+ 
-+ # DMA out for U
-+ 
-+ mov vw_setup, rb26 # VDW setup 0
-+ mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-++mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
-+ 
-+ # DMA out for V
-+ # We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+@@ -639,12 +640,11 @@ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+ sub.setf -, r3, 8 ; mov r1, ra22
-+-
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+ asr ra15, r0, 8         ; nop
-+-nop                     ; nop
-++nop                     ; nop    # TODO improve use of delay slots
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -658,15 +658,13 @@ add r1, r1, r0          ; mul24 r0, ra8, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb15
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-add r1, r1, ra21
-+-asr r1, r1, 6
-+-min r1, r1, rb22
-+-add r0, vpm, 1          # Blend in previous VPM contents at this location
-++asr r1, r1, 14          # shift2=6
-++add r1, r1, vpm         # Blend in previous VPM contents at this location
-++add r1, r1, ra30
-+ brr.anyn -, r:uvloop_b
-+-max r1, r1, 0
-+-add r1, r1, r0
-+-shr vpm, r1, 1
-++asr r1, r1, 7           # Delay 1
-++min r1, r1, rb22        # Delay 2
-++max vpm, r1, 0          # Delay 3
-+ 
-+ 
-+ # DMA out for U
-+-- 
-+2.5.0
-+
-+
-+From 5a589f03af71ff87e50d46520ed652571357c9cc Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 10:55:07 +0100
-+Subject: [PATCH 27/68] P prediction uses 4 tap filters
-+
-+---
-+ libavcodec/hevc.c          |  50 ++--
-+ libavcodec/rpi_shader.c    | 631 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  43 +--
-+ 4 files changed, 344 insertions(+), 390 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 753f85c..16f2200 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -63,15 +63,15 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+ // TODO Chroma only needs 4 taps
-+-static uint32_t rpi_filter_coefs[8][2] = {
-+-        { ENCODE_COEFFS(  0,  0,  0,  64), ENCODE_COEFFS(   0,   0,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  58), ENCODE_COEFFS(  10,  -2,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  54), ENCODE_COEFFS(  16,  -2,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -6,  46), ENCODE_COEFFS(  28,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  36), ENCODE_COEFFS(  36,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -4,  28), ENCODE_COEFFS(  46,  -6,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  16), ENCODE_COEFFS(  54,  -4,  0,  0 ) },
-+-        { ENCODE_COEFFS(  0,  0, -2,  10), ENCODE_COEFFS(  58,  -2,  0,  0 ) }
-++static uint32_t rpi_filter_coefs[8][1] = {
-++        { ENCODE_COEFFS(   0,  64,   0,   0) },
-++        { ENCODE_COEFFS(  -2,  58,  10,  -2) },
-++        { ENCODE_COEFFS(  -4,  54,  16,  -2) },
-++        { ENCODE_COEFFS(  -6,  46,  28,  -4) },
-++        { ENCODE_COEFFS(  -4,  36,  36,  -4) },
-++        { ENCODE_COEFFS(  -4,  28,  46,  -6) },
-++        { ENCODE_COEFFS(  -2,  16,  54,  -4) },
-++        { ENCODE_COEFFS(  -2,  10,  58,  -2) }
-+ };
-+ 
-+ static uint32_t get_vc_address(AVBufferRef *bref) {
-+@@ -2014,16 +2014,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2071,16 +2071,16 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2135,29 +2135,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      *u++ = rpi_filter_coefs[_mx][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      *u++ = rpi_filter_coefs[_my][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 3 + start_x;
-+-                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 3 + start_y;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-++                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+-                      *u++ = rpi_filter_coefs[_mx2][1];
-++                      u++;
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+-                      *u++ = rpi_filter_coefs[_my2][1];
-++                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 77cca46..c8d0728 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -116,8 +116,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000002f0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+@@ -128,338 +128,315 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ /* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ /* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000348] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000368] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000370] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000378] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000380] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000388] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000390] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000398] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000003a0] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003b8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003e0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000400] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000408] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000410] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000418] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000420] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000428] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000430] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000438] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000440] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000448] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000450] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000458] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000460] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000468] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000470] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000478] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000480] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000488] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000490] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000498] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000004a0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000004a8] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x000004b0] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x000004b8] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x000004c0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x000004c8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000004d0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000004d8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x000004e0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000004f0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000004f8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000500] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000508] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000510] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000518] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000520] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000528] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000530] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000538] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000540] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000548] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000550] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000558] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000560] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000568] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000570] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000578] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000580] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000588] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000590] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000598] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000005a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000005a8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000005b0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000005b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000005c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000005c8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000005d0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000005d8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000005e0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000005e8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000005f0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000005f8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000600] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000608] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000610] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000618] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000620] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000628] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000630] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000638] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000640] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000648] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000650] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000658] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000660] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000668] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000670] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000678] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000680] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000688] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000690] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000698] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000006b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000006d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000006e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000006f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000006f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000700] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000708] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000710] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000720] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000728] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000730] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000738] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000740] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000748] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000750] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000758] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000760] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000768] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000770] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000778] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000780] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000788] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000790] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000798] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000007a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000007a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000007b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000007b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000007c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000007c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000007d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000007d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000007e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000007e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000007f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000007f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000800] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000808] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000810] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000818] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000820] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000828] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000830] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000838] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000840] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000848] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000850] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000858] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000860] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000868] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000870] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000878] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000880] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000888] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000890] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000898] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000008a0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000008a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000008b0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000008b8] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000008c0] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x000008d8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000008e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000008e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000008f0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000008f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000900] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000908] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000910] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000918] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000920] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000928] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000930] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000938] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000940] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000948] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000950] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000958] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000960] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000968] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000970] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000978] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000980] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000988] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000990] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000998] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000009a0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000009a8] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000009b0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x000009b8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000009c0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000009c8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000009d0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000009d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000009e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000009e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000009f0] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x000009f8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000a00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000a08] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a10] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a18] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a20] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000a28] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a30] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a38] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a40] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000a48] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a50] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a58] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000a60] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000a68] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a70] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a78] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000a80] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000a88] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000a90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000a98] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000aa0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000aa8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000ab0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000ab8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000ac0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000ac8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000ad0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000ad8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000ae0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000ae8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000af0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000af8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000b00] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000b08] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000b10] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000b18] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000b20] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000b28] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000b30] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000b38] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000b40] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000b48] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000b50] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000b58] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000b60] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000b68] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000b70] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000b78] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000b80] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000b88] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000b90] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000b98] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ba0] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000ba8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000bb0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000bb8] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000bc0] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000bc8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000bd0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000be0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000be8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000bf0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000bf8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000c00] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000c08] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000c10] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000c18] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000c20] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000c28] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000c30] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000c38] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000c40] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000c48] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000c50] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000c58] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000c60] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000c68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000c70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000c78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000c80] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000c88] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000c90] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000c98] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000ca0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000ca8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000cb0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000cb8] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000cc0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cc8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cd0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000cd8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ce0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ce8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000cf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000d00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000d20] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d28] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d30] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d38] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d40] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000d58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000d60] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000d68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 6562fa9..1bf7a68 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 370)
-+-#define mc_filter_uv_b (rpi_shader + 584)
-+-#define mc_exit (rpi_shader + 812)
-+-#define mc_interrupt_exit8 (rpi_shader + 830)
-+-#define mc_end (rpi_shader + 860)
-++#define mc_filter_uv_b0 (rpi_shader + 324)
-++#define mc_filter_uv_b (rpi_shader + 538)
-++#define mc_exit (rpi_shader + 766)
-++#define mc_interrupt_exit8 (rpi_shader + 784)
-++#define mc_end (rpi_shader + 814)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 562dc35..8e4f18f 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -16,8 +16,8 @@
-+ # ra19                                          next ra17
-+ #
-+ # rb16                                          pitch
-+-# rb17                                          height + 5
-+-# rb18                                          height + 7
-++# rb17                                          height + 1
-++# rb18                                          height + 3
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+@@ -214,8 +214,8 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+@@ -230,18 +230,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -283,26 +276,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop
-+@@ -312,14 +293,10 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14
-+-- 
-+2.5.0
-+
-+
-+From b267b33e74268586aacdcc31ca02c35aba69a230 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:03:51 +0100
-+Subject: [PATCH 28/68] Optimised B0 pass
-+
-+---
-+ libavcodec/rpi_shader.c    | 424 +++++++++++++++++++++------------------------
-+ libavcodec/rpi_shader.h    |   8 +-
-+ libavcodec/rpi_shader.qasm |  43 +----
-+ 3 files changed, 212 insertions(+), 263 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index c8d0728..1f63ee0 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -204,239 +204,215 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000598] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000005a0] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005c8] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000638] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000648] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000650] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000708] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000710] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000718] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000720] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000728] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000730] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000738] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000740] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000748] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000750] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000758] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000760] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000768] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000770] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000778] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000780] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000788] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000790] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000798] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000007a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000007a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000007b0] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x000007b8] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x000007c0] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x000007c8] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x000007d0] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x000007d8] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x000007e0] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x000007e8] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000007f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000800] */ 0xfffffe50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000808] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000810] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000818] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000820] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000828] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000830] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000838] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000848] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000850] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000858] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000860] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000868] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000870] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000878] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000880] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000888] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000890] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000898] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000008a0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000008a8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000008b0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000008b8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000008c0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000008c8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000008e0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000008e8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000008f0] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x000008f8] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000900] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000908] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000910] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000918] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000920] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000928] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000930] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000938] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000940] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000948] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000950] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000958] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000960] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000968] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000970] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000978] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000980] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000988] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000990] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000998] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009a0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000009a8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000009b0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009b8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009c0] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000009c8] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x000009d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000009d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000009e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-++/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-++/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000009e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000009f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000009f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000a00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000a08] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000a10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000a18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000a20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000a28] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000a30] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000a38] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000a40] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000a48] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000a50] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000a58] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000a60] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000a68] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000a70] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000a78] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000a80] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00000a88] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00000a90] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00000a98] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00000aa0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00000aa8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00000ab0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00000ab8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000ac0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000ac8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000ad0] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000ad8] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000ae0] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000ae8] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000af0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000af8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000b00] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000b08] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000b10] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000b18] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000b28] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000b30] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000b38] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000b40] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000b48] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000b50] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000b58] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000b60] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000b68] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000b70] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000b78] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000b80] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000b88] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000b90] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000b98] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000ba0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000ba8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000bb0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000bb8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000bc0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000bc8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000bd0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000bd8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000be0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000be8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000bf0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-++/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-++/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-++/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-++/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-++/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-++/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-++/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-++/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-++/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-++/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000bf8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000c00] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000c08] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c28] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000c30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000c38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000c40] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000c48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000c68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000c98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ca0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ca8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000cb0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 1bf7a68..cb74887 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 324)
-+-#define mc_filter_uv_b (rpi_shader + 538)
-+-#define mc_exit (rpi_shader + 766)
-+-#define mc_interrupt_exit8 (rpi_shader + 784)
-+-#define mc_end (rpi_shader + 814)
-++#define mc_filter_uv_b (rpi_shader + 490)
-++#define mc_exit (rpi_shader + 718)
-++#define mc_interrupt_exit8 (rpi_shader + 736)
-++#define mc_end (rpi_shader + 766)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 8e4f18f..faa5755 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -357,15 +357,13 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -373,18 +371,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -426,26 +417,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ 
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b0
-+@@ -455,18 +434,12 @@ nop                     ; nop  # Delay slot 3 (TODO move more of the context scr
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-#asr r1, r1, 14
-+-#add r1, r1, ra21
-+ brr.anyn -, r:uvloop_b0
-+ asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-+ nop                    # Delay 2
-+-- 
-+2.5.0
-+
-+
-+From 7941c95bd5e968d6e1ea0462cb27c475aa4ee5e1 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:12:43 +0100
-+Subject: [PATCH 29/68] Optimised B pass
-+
-+---
-+ libavcodec/rpi_shader.c    | 202 ++++++++++++++++++++-------------------------
-+ libavcodec/rpi_shader.h    |   6 +-
-+ libavcodec/rpi_shader.qasm |  41 ++-------
-+ 3 files changed, 100 insertions(+), 149 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 1f63ee0..4e6c5ea 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -289,8 +289,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+ /* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+ /* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000838] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+ /* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+ /* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+ /* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+@@ -299,120 +299,96 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+ /* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000888] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000008d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008f0] */ 0x4f5971c6, 0x100253e0, // asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008f8] */ 0x4f5971c6, 0x100253a0, // asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000900] */ 0x4f5971c6, 0x10025360, // asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000908] */ 0x0f9d71c0, 0x10021327, // asr rb12, r0, rb23
-+-/* [0x00000910] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000918] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000920] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000928] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000930] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000938] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000940] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000948] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000950] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000958] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000960] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000968] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000970] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000978] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000980] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000988] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000990] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000998] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000009a0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000009a8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000009b0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000009b8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000009c0] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x000009c8] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x000009d0] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x000009d8] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x000009e0] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x000009e8] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x000009f0] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x000009f8] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00000a00] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000a08] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000a10] */ 0x15267d80, 0x10020227, // mov ra8, ra9
-+-/* [0x00000a18] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00000a20] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00000a28] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00000a30] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000a38] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000a40] */ 0x8d5887f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra22
-+-/* [0x00000a48] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a50] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000a58] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000a68] */ 0x4038e037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb14
-+-/* [0x00000a70] */ 0x4034d037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb13
-+-/* [0x00000a78] */ 0x4c30c237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-/* [0x00000a80] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-/* [0x00000a88] */ 0x4c28a237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-/* [0x00000a90] */ 0x4c249237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-/* [0x00000a98] */ 0x4c208237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-/* [0x00000aa0] */ 0x4c3cf237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb15
-+-/* [0x00000aa8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000ab0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000ab8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000ac0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000ac8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000ad0] */ 0xfffffe38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000ad8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000ae0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000ae8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000af0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000af8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000b00] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000b08] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000b18] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000b20] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000b28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000b30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000b40] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000b48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b68] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b70] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000b78] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000b80] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000b88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000b98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ba0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ba8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bb0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bb8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bc0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bc8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bd0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000bd8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000be0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000be8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000bf0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index cb74887..53da629 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -7,8 +7,8 @@ extern unsigned int rpi_shader[];
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 324)
-+ #define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 718)
-+-#define mc_interrupt_exit8 (rpi_shader + 736)
-+-#define mc_end (rpi_shader + 766)
-++#define mc_exit (rpi_shader + 670)
-++#define mc_interrupt_exit8 (rpi_shader + 688)
-++#define mc_end (rpi_shader + 718)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index faa5755..f38c926 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -491,8 +491,8 @@ mov r0, unif
-+ shr r1, r0, r2 # Extract width
-+ sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+ and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-++add rb17, r0, 1
-++add rb18, r0, 3
-+ shl r0, r0, 7
-+ 
-+ # r0 is currently height<<7
-+@@ -508,8 +508,6 @@ add rb26, r0, rb27
-+ # In a B frame, so also set up VPM read (reading back 16bit precision)
-+ add vr_setup, r3, rb21
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -517,18 +515,11 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-++                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23;      mov r0, unif
-+-asr rb15, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb14, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb13, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb12, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -570,26 +561,14 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+ add r0, r2, r3
-+ 
-+ mov r3, rb31
-+ 
-+-mov ra8, ra9
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+ mov ra12, ra13
-+ mov ra13, ra14
-+ 
-+-sub.setf -, r3, 8 ; mov r1, ra22
-++sub.setf -, r3, 4 ; mov r1, ra22
-+ # apply horizontal filter
-+ brr.anyn -, r:uvloop_b
-+ mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+@@ -598,14 +577,10 @@ nop                     ; nop    # TODO improve use of delay slots
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb14
-+-nop                     ; mul24 r0, ra13, rb13
-+-add r1, r1, r0          ; mul24 r0, ra12, rb12
-+-add r1, r1, r0          ; mul24 r0, ra11, rb11
-+-add r1, r1, r0          ; mul24 r0, ra10, rb10
-+-add r1, r1, r0          ; mul24 r0, ra9, rb9
-+-add r1, r1, r0          ; mul24 r0, ra8, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb15
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14          # shift2=6
-+-- 
-+2.5.0
-+
-+
-+From 3c9637fbe9311db205c5e3a1ab178771dab22856 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:17:09 +0100
-+Subject: [PATCH 30/68] Used P delay slots more efficiently
-+
-+---
-+ libavcodec/rpi_shader.c    | 437 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  19 +-
-+ 3 files changed, 228 insertions(+), 238 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 4e6c5ea..a1af4e3 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -156,239 +156,236 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ /* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ /* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000420] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000428] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000430] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+ /* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000440] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x00000448] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000450] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000458] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000460] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000468] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000470] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000478] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000480] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000488] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000490] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000498] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004a0] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x000004a8] */ 0xfffffec0, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004b0] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000004b8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004c0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004c8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004e0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004f8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000500] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000508] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000510] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000518] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000520] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000528] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000530] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000538] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000540] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000548] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000550] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000558] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000560] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000568] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000570] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000578] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000580] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000588] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000590] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000598] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005a0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005a8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005b0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005b8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005c0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000620] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000628] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000630] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000638] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000640] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000648] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000650] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000658] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000660] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000668] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000670] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000678] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000680] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000688] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000690] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000698] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006a0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006a8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006b0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006b8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006c0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006c8] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000006d0] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000006d8] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000006e0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006e8] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000006f0] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006f8] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x00000700] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x00000708] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x00000710] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000718] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000720] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000728] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000730] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000738] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000740] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000748] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000750] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000758] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000760] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000768] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000770] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000778] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000780] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000790] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000798] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000007a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-++/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008d0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008e8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008f0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008f8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000900] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000908] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000910] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000918] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000920] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000928] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000930] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000938] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000940] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000948] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000950] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000958] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000960] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000968] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000970] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000978] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000980] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000988] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000990] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000998] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009a0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000009a8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009b0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000009b8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000009c0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a10] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-++/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-++/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-++/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-++/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-++/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-++/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 53da629..1fb3e37 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 324)
-+-#define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 670)
-+-#define mc_interrupt_exit8 (rpi_shader + 688)
-+-#define mc_end (rpi_shader + 718)
-++#define mc_filter_uv_b0 (rpi_shader + 318)
-++#define mc_filter_uv_b (rpi_shader + 484)
-++#define mc_exit (rpi_shader + 664)
-++#define mc_interrupt_exit8 (rpi_shader + 682)
-++#define mc_end (rpi_shader + 712)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index f38c926..02e95dd 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -268,6 +268,7 @@ add t0s, ra_x2_base, r2
-+ 
-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-++# apply horizontal filter
-+ nop                  ; mul24 r2, r0, ra0
-+ nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+ nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+@@ -276,20 +277,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop
-+-nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 07f733af90de9d5823f62c0b7276bb1c7187ec6f Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:22:25 +0100
-+Subject: [PATCH 31/68] Improved use of delay slots
-+
-+---
-+ libavcodec/rpi_shader.c    | 503 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  10 +-
-+ libavcodec/rpi_shader.qasm |  41 ++--
-+ 3 files changed, 265 insertions(+), 289 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index a1af4e3..c498f28 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -122,270 +122,263 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+ /* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+ /* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x0d9c8e40, 0xd00229e7, // sub.setf -,8,r1
-+-/* [0x00000320] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000368] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000370] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000378] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000380] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000388] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000390] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000398] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000400] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000408] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000410] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000418] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000420] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000428] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000430] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000438] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000440] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000448] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000450] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000458] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000460] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000468] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000470] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000478] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000480] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000488] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000490] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000498] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x000004a0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004a8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004c8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004d8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004e0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004f8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000500] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000508] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000510] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000518] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000520] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000528] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000530] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000538] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000540] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000548] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000550] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000558] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000560] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000568] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000570] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000578] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000580] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000588] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000590] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000598] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005a0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005a8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000600] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000608] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000610] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000618] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000620] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000628] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000630] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000638] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000640] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000648] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000650] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000658] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000660] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000668] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000670] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000678] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000680] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000688] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000690] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000698] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006a0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006a8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006b0] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x000006b8] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x000006c0] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x000006c8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006d0] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x000006d8] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006e0] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000006e8] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000006f0] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000006f8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000700] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000708] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000710] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000718] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000720] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000728] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000730] */ 0x0f9ce3c0, 0xd0020c27, // asr vpm, r1, 14
-+-/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000748] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000750] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000758] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000760] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000770] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000778] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000780] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000788] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b
-+-/* [0x00000790] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000798] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007a0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007a8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007b0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007b8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007c0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007d0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007d8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007e0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007f0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000800] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000808] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000810] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000818] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000820] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000828] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000830] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000838] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000840] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000848] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000850] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000858] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000860] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000870] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000878] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000008b8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008c8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008d0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008d8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008e0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008e8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008f0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008f8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000900] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000908] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000910] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000918] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000920] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000928] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000930] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000938] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000940] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000948] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000950] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000958] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000960] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000968] */ 0x0c9e74c0, 0x10020827, // add r0, r2, r3
-+-/* [0x00000970] */ 0x159dffc0, 0x100208e7, // mov r3, rb31
-+-/* [0x00000978] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00000980] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000988] */ 0x8d5847f6, 0xd00269e1, // sub.setf -, r3, 4 ; mov r1, ra22
-+-/* [0x00000990] */ 0xffffff20, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000998] */ 0x553e7d81, 0x100243a0, // mov ra14, ra15          ; mul24 r0, r0, r1
-+-/* [0x000009a0] */ 0x0f9c81c0, 0xd00203e7, // asr ra15, r0, 8         ; nop
-+-/* [0x000009a8] */ 0x009e7000, 0x100009e7, // nop                     ; nop
-+-/* [0x000009b0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009b8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009c0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009c8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009e8] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009f0] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009f8] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a00] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a08] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a10] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a18] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a20] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a28] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a30] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a40] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a48] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a50] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a58] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a68] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_interrupt_exit8
-++/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+ /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-// ::mc_interrupt_exit8
-+-/* [0x00000aa8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b10] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 1fb3e37..3fac45f 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -5,10 +5,10 @@ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 318)
-+-#define mc_filter_uv_b (rpi_shader + 484)
-+-#define mc_exit (rpi_shader + 664)
-+-#define mc_interrupt_exit8 (rpi_shader + 682)
-+-#define mc_end (rpi_shader + 712)
-++#define mc_filter_uv_b0 (rpi_shader + 316)
-++#define mc_filter_uv_b (rpi_shader + 476)
-++#define mc_exit (rpi_shader + 650)
-++#define mc_interrupt_exit8 (rpi_shader + 668)
-++#define mc_end (rpi_shader + 698)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 02e95dd..10f5113 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -221,8 +221,6 @@ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb26, r0, rb27
-+ 
-+-sub.setf -,8,r1 # 8-r1, so if <0 (negative) we need to use the full code
-+-
-+ # get filter coefficients
-+ 
-+ mov r0, unif
-+@@ -410,20 +408,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop_b0
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll
-+-asr ra15, r0, 8         ; nop  # TODO isn't ra15 already in 24bit precision, may not need the sign extension here?
-+-nop                     ; nop  # Delay slot 3 (TODO move more of the context scroll into here)
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+@@ -432,9 +422,9 @@ nop                     ; mul24 r0, ra13, rb9
-+ add r1, r1, r0          ; mul24 r0, ra12, rb8
-+ add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++sub.setf -, r3, rb18
-+ brr.anyn -, r:uvloop_b0
-+-asr vpm, r1, 14        # Delay 1 shifts down by shift2=6, but results are still in 16bit precision TODO may be able to avoid the mul24 and use more delay slots
-++asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still in 16bit precision
-+ nop                    # Delay 2
-+ nop                    # Delay 3
-+ 
-+@@ -554,19 +544,12 @@ add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+ nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+ add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+ nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r0, r2, r3
-+-
-+-mov r3, rb31
-+-
-+-mov ra12, ra13
-+-mov ra13, ra14
-+-
-+-sub.setf -, r3, 4 ; mov r1, ra22
-+-# apply horizontal filter
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 4    ; mov ra12, ra13
-+ brr.anyn -, r:uvloop_b
-+-mov ra14, ra15          ; mul24 r0, r0, r1         # last bit of context scroll, including clamp to zero
-+-asr ra15, r0, 8         ; nop
-+-nop                     ; nop    # TODO improve use of delay slots
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 29956c5549eb94e418c42e838d0bfceeb95730b0 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:31:23 +0100
-+Subject: [PATCH 32/68] Avoid writeback of first B results
-+
-+---
-+ libavcodec/rpi_shader.c    | 229 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |   8 +-
-+ libavcodec/rpi_shader.qasm |  18 +---
-+ 3 files changed, 121 insertions(+), 134 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index c498f28..ba453a2 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -255,130 +255,125 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+ /* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-+ /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000728] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000730] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000738] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000740] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000748] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000750] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000758] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000760] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000768] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000770] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000778] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000780] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000788] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000790] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000798] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007a0] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007a8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007b0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000007b8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007c0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007c8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007d0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007e0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007e8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007f0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007f8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000800] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000808] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000810] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000818] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000820] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000828] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000830] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000838] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000840] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+ /* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000850] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000858] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000860] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000868] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000878] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000898] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008a8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008b0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008b8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000008c0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008c8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008d0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008d8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000008e0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000008e8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008f0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008f8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000900] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000908] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000910] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000918] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000920] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000928] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000930] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000938] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000940] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000948] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000950] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000958] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000960] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000968] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000970] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000978] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000980] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000988] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000990] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000998] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009b0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009b8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009c0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009c8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x000009d0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000009d8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000009e0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000009e8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009f0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009f8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a08] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a10] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a28] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a30] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a48] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a58] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a60] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a68] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a70] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ab0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ab8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ac0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ad8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3fac45f..45dbe0e 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -6,9 +6,9 @@ extern unsigned int rpi_shader[];
-+ #define mc_setup_uv (rpi_shader + 0)
-+ #define mc_filter_uv (rpi_shader + 152)
-+ #define mc_filter_uv_b0 (rpi_shader + 316)
-+-#define mc_filter_uv_b (rpi_shader + 476)
-+-#define mc_exit (rpi_shader + 650)
-+-#define mc_interrupt_exit8 (rpi_shader + 668)
-+-#define mc_end (rpi_shader + 698)
-++#define mc_filter_uv_b (rpi_shader + 466)
-++#define mc_exit (rpi_shader + 640)
-++#define mc_interrupt_exit8 (rpi_shader + 658)
-++#define mc_end (rpi_shader + 688)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 10f5113..e138c95 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -428,22 +428,14 @@ asr vpm, r1, 6         # Delay 1 shifts down by shift2=6, but results are still
-+ nop                    # Delay 2
-+ nop                    # Delay 3
-+ 
-++# in pass0 we don't really need to save any results, but need to discard the uniforms
-+ # DMA out for U
-+ 
-+-mov vw_setup, rb26 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW    # TODO in pass0 we don't need to save any results
-+-
-+-# DMA out for V
-+-# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+-# Could potentially push this write into the start of the next pipeline stage.
-+-mov r0, 16
-+-mov -, vw_wait
-+-
-+ bra -, ra31
-+-add vw_setup, rb26, r0 # VDW setup 0
-+-mov vw_setup, rb29 # Stride
-+-mov vw_addr, unif # start the VDW
-++mov r0, unif           # Delay 1
-++mov r0, unif           # Delay 2
-++nop                    # Delay 3
-++
-+ 
-+ ################################################################################
-+ 
-+-- 
-+2.5.0
-+
-+
-+From c184ce179f16ca497ed003805193651fa3b30817 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 14 May 2015 11:36:24 +0100
-+Subject: [PATCH 33/68] Cutdown size of chroma prediction commands
-+
-+---
-+ libavcodec/hevc.c          |  17 +-
-+ libavcodec/rpi_shader.c    | 543 ++++++++++++++++++++++-----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  11 +-
-+ 4 files changed, 281 insertions(+), 302 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 16f2200..da81a54 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -54,7 +54,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-+-#define RPI_CHROMA_COMMAND_WORDS 12
-++#define RPI_CHROMA_COMMAND_WORDS 10
-+ #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+@@ -2019,11 +2019,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+-                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2078,9 +2075,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2141,11 +2136,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my][0];
-+-                      u++;
-+-                      *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); // TODO this will become unused once we have a dedicated pass0 filter
-+-                      *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-++                      u+=2; // Intermediate results are not written back in first pass of B filtering
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
-+@@ -2153,11 +2145,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+-                      // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+-                      u++;
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+-                      u++;
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2795,7 +2784,7 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-+-        s->u_mvs[i] += 3;  // Padding words
-++        s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index ba453a2..b0b93b5 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -89,291 +89,286 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+ /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+ /* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000360] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000368] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000370] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000378] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000380] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000388] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000390] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000398] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000003f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000400] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000408] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000410] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000418] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000420] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000428] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000430] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000438] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000440] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000448] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000450] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000458] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000460] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000468] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000470] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000478] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000480] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000488] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000490] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000498] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004a8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004b0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004b8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004c0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004d0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004d8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004e0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004e8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-++/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004f0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000004f8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000500] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000508] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000510] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000518] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000520] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000528] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000530] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000538] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000540] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000548] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000550] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000558] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000560] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000568] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000570] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000578] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000580] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000588] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000590] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000598] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005d8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005e8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005f0] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000005f8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000600] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000608] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000610] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000618] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000620] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000628] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000630] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000638] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000640] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000648] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000650] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000658] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000670] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000678] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000680] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000688] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000690] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000698] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006a0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006a8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x000006b0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x000006b8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006c0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006c8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000006d0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000006d8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000006e0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000006e8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000006f0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000006f8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000700] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000708] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000710] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
-+ /* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000728] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000738] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000740] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000748] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000750] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000758] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000760] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000768] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000770] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000778] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000780] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000788] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000790] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000798] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000007a0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000007b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000007d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000007e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000007e8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000007f0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000007f8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000800] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000808] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000810] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000818] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000828] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000830] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000838] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000840] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000848] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000850] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000858] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000860] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000868] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000890] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008d0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a18] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a58] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 45dbe0e..99927c4 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 316)
-+-#define mc_filter_uv_b (rpi_shader + 466)
-+-#define mc_exit (rpi_shader + 640)
-+-#define mc_interrupt_exit8 (rpi_shader + 658)
-+-#define mc_end (rpi_shader + 688)
-++#define mc_filter_uv (rpi_shader + 148)
-++#define mc_filter_uv_b0 (rpi_shader + 310)
-++#define mc_filter_uv_b (rpi_shader + 458)
-++#define mc_exit (rpi_shader + 630)
-++#define mc_interrupt_exit8 (rpi_shader + 648)
-++#define mc_end (rpi_shader + 678)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index e138c95..d9ffcda 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -167,8 +167,6 @@ add t0s, r2, r1
-+ 
-+ # Dump padding words
-+ mov r0, unif
-+-mov r0, unif
-+-mov r0, unif
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -228,11 +226,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -362,11 +359,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+@@ -490,11 +486,10 @@ asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+ asr ra0, r0, rb23;      mov r0, unif
-+-                        mov r0, unif
-+ asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-++asr rb8, r0, rb23
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+-- 
-+2.5.0
-+
-+
-+From 5edce4e2a69b82aceb72f331737b5b00bf3af912 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:21:49 +0100
-+Subject: [PATCH 34/68] hevc: don't redirect when not rpi_enabled
-+
-+---
-+ libavcodec/hevc.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index da81a54..60b3d97 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -1455,7 +1455,7 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
-+  */
-+ 
-+ #ifdef RPI_INTER
-+-#define RPI_REDIRECT(fn) rpi_ ## fn
-++#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
-+ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+-- 
-+2.5.0
-+
-+
-+From 86652e6a111a593a8c14c8eecaa7e26a068febcf Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:22:02 +0100
-+Subject: [PATCH 35/68] Use /dev/vcio for mailbox access
-+
-+---
-+ libavcodec/rpi_mailbox.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+index 536896f..77a56dd 100644
-+--- a/libavcodec/rpi_mailbox.c
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -39,7 +39,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ 
-+ #define MAJOR_NUM 100
-+ #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+-#define DEVICE_FILE_NAME "/dev/char_dev"
-++#define DEVICE_FILE_NAME "/dev/vcio"
-+ 
-+ #include "rpi_mailbox.h"
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 05eb83c2f257c17a02abc01a6be6ae9df2d8e653 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:25:25 +0100
-+Subject: [PATCH 36/68] Use vcsm for all memory allocations
-+
-+---
-+ libavcodec/rpi_qpu.c | 174 +++++++++++++++++++--------------------------------
-+ 1 file changed, 64 insertions(+), 110 deletions(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 60bf079..f62051f 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,7 +1,5 @@
-+ #ifdef RPI
-+-// define RPI_USE_VCSM to use the vcsm device for shared memory
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+-#define RPI_USE_VCSM
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+ #define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+@@ -25,9 +23,7 @@
-+ #include "rpi_shader.h"
-+ #include "rpi_hevc_transform.h"
-+ 
-+-#ifdef RPI_USE_VCSM
-+ #include "rpi_user_vcsm.h"
-+-#endif
-+ 
-+ // On Pi2 there is no way to access the VPU L2 cache
-+ // GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
-+@@ -96,7 +92,6 @@ struct GPU
-+   unsigned int vpu_code[VPU_CODE_SIZE];
-+   short transMatrix2even[16*16*2];
-+   int open_count; // Number of allocated video buffers
-+-  unsigned int vc_handle; // Handle of this memory
-+   int      mb; // Mailbox handle
-+   int      vc; // Address in GPU memory
-+   int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-+@@ -105,6 +100,7 @@ struct GPU
-+ // Stop more than one thread trying to allocate memory or use the processing resources at once
-+ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-++static GPU_MEM_PTR_T gpu_mem_ptr;
-+ 
-+ #if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-+ static unsigned int Microseconds(void) {
-+@@ -132,39 +128,27 @@ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+ 
-++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-++static void gpu_free_internal(GPU_MEM_PTR_T *p);
-++
-+ // Connect to QPU, returns 0 on success.
-+ static int gpu_init(volatile struct GPU **gpu) {
-+   int mb = mbox_open();
-+   int vc;
-+-  int handle;
-+   volatile struct GPU* ptr;
-+ 	if (mb < 0)
-+ 		return -1;
-+ 
-+ 	if (qpu_enable(mb, 1)) return -2;
-+ 
-+-#ifdef RPI_USE_VCSM
-+   vcsm_init();
-+-#endif
-++  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-++  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-++  memset(ptr, 0, sizeof *ptr);
-++  vc = gpu_mem_ptr.vc;
-+ 
-+-  handle = mem_alloc(mb, sizeof(struct GPU), 4096, GPU_MEM_FLG);
-+-  if (!handle)
-+-  {
-+-    qpu_enable(mb, 0);
-+-    return -3;
-+-  }
-+-	vc = mem_lock(mb, handle);
-+-	ptr = mapmem_shared((vc+GPU_MEM_MAP)&~0xc0000000, sizeof(struct GPU));
-+-	if (ptr == NULL)
-+-	{	mem_free(mb, handle);
-+-		mem_unlock(mb, handle);
-+-		qpu_enable(mb, 0);
-+-		return -4;
-+-	}
-+-
-+-	ptr->mb = mb;
-+-	ptr->vc_handle = handle;
-+-	ptr->vc = vc;
-++  ptr->mb = mb;
-++  ptr->vc = vc;
-+ 
-+   printf("GPU allocated at 0x%x\n",vc);
-+ 
-+@@ -226,94 +210,74 @@ static void gpu_unlock(void) {
-+   pthread_mutex_unlock(&gpu_mutex);
-+ }
-+ 
-++static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-++  assert(p->vcsm_handle);
-++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++  assert(p->vc_handle);
-++  p->arm = vcsm_lock(p->vcsm_handle);
-++  assert(p->arm);
-++  p->vc = mem_lock(mb, p->vc_handle);
-++  assert(p->vc);
-++  return 0;
-++}
-++
-+ // Allocate memory on GPU
-+ // Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-+ // Returns 0 on success.
-+ // This allocates memory that will not be cached in ARM's data cache.
-+ // Therefore safe to use without data cache flushing.
-+-int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) {
-++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-++{
-++  int r;
-+   gpu_lock();
-+-  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+-  p->vcsm_handle = 0;
-+-  if (!p->vc_handle)
-+-  {
-+-    qpu_enable(gpu->mb, 0);
-+-    return -3;
-+-  }
-+-  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  p->arm = mapmem_shared((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+-  p->numbytes = numbytes;
-+-  if (p->arm == NULL)
-+-  {
-+-    mem_free(gpu->mb, p->vc_handle);
-+-    mem_unlock(gpu->mb, p->vc_handle);
-+-    gpu_unlock();
-+-    qpu_enable(gpu->mb, 0);
-+-    return -4;
-+-  }
-++  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-+   gpu->open_count++;
-+   gpu_unlock();
-+-  return 0;
-++  return r;
-+ }
-+ 
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-  // This only works when using RPI_USE_VCSM
-+   void *tmp = vcsm_lock(p->vcsm_handle);
-+   vcsm_unlock_ptr(tmp);
-+ }
-+ 
-++static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-++  assert(p->vcsm_handle);
-++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-++  assert(p->vc_handle);
-++  p->arm = vcsm_lock(p->vcsm_handle);
-++  assert(p->arm);
-++  p->vc = mem_lock(gpu->mb, p->vc_handle);
-++  assert(p->vc);
-++  return 0;
-++}
-++
-+ // This allocates data that will be
-+ //    Cached in ARM L2
-+ //    Uncached in VPU L2
-+-int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) {
-++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-++{
-++  int r;
-+   gpu_lock();
-+-#ifdef RPI_USE_VCSM
-+-  {
-+-      p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); // f....... locks up for VP9 - retest this?
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); // 3b...... works
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); //fb...... locks up
-+-      //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); // 3b works (but corrupted due to caching)
-+-      p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-      p->arm = vcsm_lock(p->vcsm_handle);
-+-      p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  }
-+-#else
-+-  p->vc_handle = mem_alloc(gpu->mb, numbytes, 4096, GPU_MEM_FLG);
-+-  p->vcsm_handle = 0;
-+-  if (!p->handle)
-+-  {
-+-    qpu_enable(gpu->mb, 0);
-+-    return -3;
-+-  }
-+-  p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  printf("This mapmem_private does not seem to work\n");
-+-  exit(-1);
-+-  p->arm = mapmem_private((p->vc+GPU_MEM_MAP)&~0xc0000000,numbytes);
-+-  p->numbytes = numbytes;
-+-  if (p->arm == NULL)
-+-  {
-+-    mem_free(gpu->mb, p->handle);
-+-    mem_unlock(gpu->mb, p->handle);
-+-    gpu_unlock();
-+-    qpu_enable(gpu->mb, 0);
-+-    return -4;
-+-  }
-+-#endif
-++  r = gpu_malloc_cached_internal(numbytes, p);
-+   gpu->open_count++;
-+   gpu_unlock();
-+-  return 0;
-++  return r;
-+ }
-+ 
-+ static void gpu_term(void)
-+ {
-+-	int mb;
-+-	unsigned handle;
-++  int mb;
-+ 
-+   if (gpu==NULL)
-+     return;
-+   mb = gpu->mb;
-+-  handle = gpu->vc_handle;
-+ 
-+ #ifdef RPI_ASYNC
-+   {
-+@@ -323,37 +287,26 @@ static void gpu_term(void)
-+   }
-+ #endif
-+ 
-++  qpu_enable(mb, 0);
-++  gpu_free_internal(&gpu_mem_ptr);
-+ 
-+-	unmapmem((void*)gpu, sizeof(struct GPU));
-+-	mem_unlock(mb, handle);
-+-	mem_free(mb, handle);
-+-	qpu_enable(mb, 0);
-+-#ifdef RPI_USE_VCSM
-+   vcsm_exit();
-+-#endif
-+-	mbox_close(mb);
-++
-++  mbox_close(mb);
-+   gpu = NULL;
-+ }
-+ 
-+-void gpu_free(GPU_MEM_PTR_T *p) {
-++void gpu_free_internal(GPU_MEM_PTR_T *p) {
-+   int mb = gpu->mb;
-+-	unsigned handle = p->vc_handle;
-++  mem_unlock(mb,p->vc_handle);
-++  vcsm_unlock_ptr(p->arm);
-++  vcsm_free(p->vcsm_handle);
-++}
-++
-++void gpu_free(GPU_MEM_PTR_T *p) {
-+   gpu_lock();
-+-#ifdef RPI_USE_VCSM
-+-  if (p->vcsm_handle) {
-+-      mem_unlock(mb,p->vc_handle);
-+-      vcsm_unlock_ptr(p->arm);
-+-      vcsm_free(p->vcsm_handle);
-+-  } else {
-+-	unmapmem((void*)p->arm, sizeof(struct GPU));
-+-      mem_unlock(mb, handle);
-+-      mem_free(mb, handle);
-+-  }
-+-#else
-+-	unmapmem((void*)p->arm, sizeof(struct GPU));
-+-	mem_unlock(mb, handle);
-+-	mem_free(mb, handle);
-+-#endif
-++
-++  gpu_free_internal(p);
-+ 
-+   gpu->open_count--;
-+   if (gpu->open_count==0) {
-+@@ -386,20 +339,21 @@ unsigned int vpu_get_constants(void) {
-+ 
-+ static void *vpu_start(void *arg) {
-+   while(1) {
-++    int *p;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+       //printf("Checking number %d %d\n",vpu_async_head,vpu_async_tail);
-+       pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+-    int *p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    p = vpu_cmds[vpu_async_head%MAXCMDS];
-+     pthread_mutex_unlock(&post_mutex);
-+ 
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-+     if (p[7]) {
-+-        GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-++        //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+     }
-+     vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+-- 
-+2.5.0
-+
-+
-+From 72b441dc9a9965ce3e5812be87081ffae1e166de Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 15:43:17 +0100
-+Subject: [PATCH 37/68] Enable EARLY_MALLOC and fix sps access bug
-+
-+---
-+ libavcodec/hevc.c | 5 +++--
-+ 1 file changed, 3 insertions(+), 2 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 60b3d97..eee22eb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -42,7 +42,7 @@
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-+   // For some unknown reason, the code seems to crash if I do a late malloc
-+-  #define EARLY_MALLOC
-++  //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ #endif
-+@@ -147,7 +147,8 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ #ifdef RPI
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    int coeffs_in_ctb = (1 << s->ps.sps->log2_ctb_size) * (1 << s->ps.sps->log2_ctb_size);
-++    assert(sps);
-++    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+     printf("pic_arrays_init\n");
-+     printf("Allocated %d\n",coefs_per_row);
-+-- 
-+2.5.0
-+
-+
-+From 6a0001e44872f9333caf6c6e7e5046cd56a3a21a Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 14 May 2015 16:40:51 +0100
-+Subject: [PATCH 38/68] Add copy of av_mod_uintp2 for use with stable ffmpeg
-+
-+---
-+ libavcodec/hevc.c | 8 ++++++++
-+ 1 file changed, 8 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index eee22eb..cfdf6c2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -49,6 +49,14 @@
-+ 
-+ // #define DISABLE_MC
-+ 
-++#ifndef av_mod_uintp2
-++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
-++{
-++    return a & ((1 << p) - 1);
-++}
-++#   define av_mod_uintp2   av_mod_uintp2_c
-++#endif
-++
-+ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ 
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 6fbc046c87e413d38c789e82f73dfece27a64ff4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 18 May 2015 11:11:02 +0100
-+Subject: [PATCH 39/68] Added support for weighted prediction in P frames
-+
-+---
-+ libavcodec/hevc.c          |  52 ++++-
-+ libavcodec/rpi_shader.c    | 566 +++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |  12 +-
-+ libavcodec/rpi_shader.qasm |  39 +++-
-+ 4 files changed, 384 insertions(+), 285 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index cfdf6c2..0906ac2 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -62,7 +62,7 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-+-#define RPI_CHROMA_COMMAND_WORDS 10
-++#define RPI_CHROMA_COMMAND_WORDS 12
-+ #define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+@@ -2018,6 +2018,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                 int chan = x0>>8;
-++                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2030,6 +2032,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      if (weight_flag) {
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1] & 0xffff);
-++                      } else {
-++                          *u++ = 1; // Weight of 1 and offset of 0
-++                          *u++ = 1;
-++                      }
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2072,6 +2081,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+                 //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+                 int chan = x0>>8;
-++                int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                                       (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+@@ -2085,6 +2096,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      if (weight_flag) {
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-++                      } else {
-++                          *u++ = 1; // Weight of 1 and offset of 0
-++                          *u++ = 1;
-++                      }
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2146,6 +2164,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-++                      u+=2; // Weights not supported in B slices
-+                       u+=2; // Intermediate results are not written back in first pass of B filtering
-+ 
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
-+@@ -2156,6 +2175,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+                       *u++ = rpi_filter_coefs[_my2][0];
-++                      u+=2; // Weights not supported in B slices
-+                       *u++ = (get_vc_address(s->frame->buf[1]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+@@ -2782,6 +2802,9 @@ static void rpi_inter_clear(HEVCContext *s)
-+     int i;
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-++    int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                           (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++
-+     for(i=0;i<8;i++) {
-+         s->u_mvs[i] = s->mvs_base[i];
-+         *s->u_mvs[i]++ = 0;
-+@@ -2793,6 +2816,13 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-++        if (weight_flag) {
-++            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-++            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-++        } else {
-++            *s->u_mvs[i]++ = 1 << 5;
-++            *s->u_mvs[i]++ = 6;
-++        }
-+         s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+@@ -2836,12 +2866,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
-+ 
-+ #ifdef RPI
-++#ifdef RPI_INTER_QPU
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+                     && s->ps.sps->width <= RPI_MAX_WIDTH
-+                     && !s->ps.pps->cross_component_prediction_enabled_flag
-+                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-+-                    && !(s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-++#else
-++    s->enable_rpi = s->ps.sps->bit_depth == 8
-++                    && s->ps.sps->width <= RPI_MAX_WIDTH
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag
-++                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-++#endif
-++
-++    /*if (!s->enable_rpi) {
-++      if (s->ps.pps->cross_component_prediction_enabled_flag)
-++        printf("Cross component\n");
-++      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-++        printf("Tiles\n");
-++      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-++        printf("Weighted P slice\n");
-++      if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-++        printf("Weighted B slice\n");
-++    }*/
-+ 
-+ #endif
-+ 
-+@@ -2974,6 +3021,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
-+ 
-+ #ifdef RPI
-+     s->enable_rpi = 0;
-++    //printf("Wavefront\n");
-+ #endif
-+ 
-+     if(ctb_row) {
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index b0b93b5..3f04d80 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -88,287 +88,307 @@ unsigned int rpi_shader[] = {
-+ /* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ /* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+ /* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-++/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-+ // ::mc_filter_uv
-+-/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000350] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000360] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000368] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000370] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000378] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000380] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000388] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000390] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000398] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003a8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003b0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000003b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000003d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000003d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000003e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000003e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000003f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000003f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000400] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000408] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000410] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000418] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000420] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000428] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000430] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000438] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000440] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000448] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000450] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000458] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000460] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000468] */ 0x0c567380, 0x10020867, // add r1, r1, ra21
-+-/* [0x00000470] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000478] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+-/* [0x00000480] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000488] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000490] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000498] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000004a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000004b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000004b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000004c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000004c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000004d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x000004d8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000004e0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000004e8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000004f0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000004f8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000500] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000508] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000510] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000518] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000520] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000528] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000530] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000538] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000540] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000548] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000550] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000558] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000560] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000568] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000570] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000578] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000580] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000588] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000590] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000598] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000005b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000005c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000005d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000005d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000005e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000005e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x000005f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000005f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000600] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000608] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000610] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000618] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000620] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000628] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000630] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000638] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000640] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000648] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000650] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000658] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000660] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000668] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000670] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000678] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000680] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000688] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000690] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000698] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000006a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000006b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000006b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000006c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000006c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000006d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000006d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000006e0] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x000006e8] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x000006f0] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x000006f8] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000700] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000708] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000710] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000718] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000720] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000728] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000730] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000738] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000740] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000748] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000750] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000758] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000760] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000768] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000770] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000778] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000780] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000788] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000798] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000007a0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000007a8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000007b0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000007b8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000007c0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000007c8] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x000007d0] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x000007d8] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x000007e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000007e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000007f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000007f8] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000808] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000810] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000818] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000820] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000828] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000830] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000838] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000840] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000848] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000850] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000858] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-++/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-++/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000860] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000868] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000870] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000878] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000880] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000888] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000890] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000898] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000008a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000008a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000008b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000008c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000008c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000008d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000008d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000008e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000008e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000008f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000008f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000900] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000908] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000910] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000918] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000920] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000928] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000930] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000938] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000940] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000948] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000950] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x00000958] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000960] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000968] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000970] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000978] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000980] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000988] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000990] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000998] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x000009a8] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x000009b0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x000009b8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000009c0] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x000009c8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x000009d0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-++/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-++/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-++/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x000009e0] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x000009f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a08] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a18] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000a20] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a28] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a30] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a38] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a40] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a48] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a50] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a58] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a60] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a68] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000a80] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a88] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 99927c4..cec9901 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,11 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 148)
-+-#define mc_filter_uv_b0 (rpi_shader + 310)
-+-#define mc_filter_uv_b (rpi_shader + 458)
-+-#define mc_exit (rpi_shader + 630)
-+-#define mc_interrupt_exit8 (rpi_shader + 648)
-+-#define mc_end (rpi_shader + 678)
-++#define mc_filter_uv (rpi_shader + 152)
-++#define mc_filter_uv_b0 (rpi_shader + 342)
-++#define mc_filter_uv_b (rpi_shader + 494)
-++#define mc_exit (rpi_shader + 670)
-++#define mc_interrupt_exit8 (rpi_shader + 688)
-++#define mc_end (rpi_shader + 718)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index d9ffcda..97c4c02 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -9,7 +9,12 @@
-+ #                                               (ra15 isn't clamped to zero - this happens during the
-+ #                                                copy to ra14, and during its use in the vertical filter)
-+ #
-+-# rb8...rb15                                    eight vertical filter coefficients
-++# rb8...rb11                                    eight vertical filter coefficients
-++
-++# rb12 offset to add before shift
-++# rb13 shift
-++# rb14 weight (U on left, V on right)
-++# rb15 offset (U on left, V on right)
-+ #
-+ # ra16                                          clipped(row start address+elem_num)&~3
-+ # ra17                                          per-channel shifts
-+@@ -165,6 +170,9 @@ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_x2_base, r2
-+ add t0s, r2, r1
-+ 
-++mov rb12,unif # offset before shift
-++mov rb13,unif # offset after shift
-++
-+ # Dump padding words
-+ mov r0, unif
-+ 
-+@@ -231,11 +239,21 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++mov r0, unif # U offset/weight
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++mov r0, unif # V offset/weight
-++asr.ifnz rb15, r0, r2
-++shl r0, r0, r2
-++asr.ifnz rb14, r0, r2
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+ mov r5rep, -8
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+@@ -279,6 +297,11 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-++mov rb12,32
-++mov rb13,6
-++mov rb14,1
-++mov rb15,0
-++
-+ # apply vertical filter and write to VPM
-+ 
-+ nop                     ; mul24 r1, ra14, rb10
-+@@ -288,9 +311,11 @@ add r1, r1, r0          ; mul24 r0, ra15, rb11
-+ add r1, r1, r0          ; mov -, vw_wait
-+ sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+ asr r1, r1, 14
-+-add r1, r1, ra21
-++nop                     ; mul24 r1, r1, rb14
-++add r1, r1, rb12
-++asr r1, r1, rb13
-+ brr.anyn -, r:uvloop
-+-asr r1, r1, 6          # Delay 1
-++add r1, r1, rb15       # Delay 1
-+ min r1, r1, rb22       # Delay 2
-+ max vpm, r1, 0         # Delay 3
-+ 
-+@@ -364,6 +389,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov r0, unif # U offset/weight
-++mov r0, unif # V offset/weight
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+@@ -491,6 +519,9 @@ asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+ asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+ asr rb8, r0, rb23
-+ 
-++mov r0, unif # U offset/weight
-++mov r0, unif # V offset/weight
-++
-+ # r2 is elem_num
-+ # r3 is loop counter
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 21506fa30a0953a1caa467ea31f9ee2ebbdf5b79 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 19 May 2015 08:43:30 +0100
-+Subject: [PATCH 40/68] Improved ordering of tasks
-+
-+---
-+ libavcodec/hevc.c | 8 ++++----
-+ 1 file changed, 4 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 0906ac2..1cc7900 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2930,15 +2930,15 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+           s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-#ifdef RPI_INTER_QPU
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-#endif
-+             // Transform all blocks
-+             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-++#ifdef RPI_INTER_QPU
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++#endif
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-+ 
-+-- 
-+2.5.0
-+
-+
-+From c0437176b4799453c7731e8372c622c03d65db1b Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 19:58:19 +0100
-+Subject: [PATCH 41/68] Drafted Luma inter prediction
-+
-+---
-+ libavcodec/rpi_shader.qasm | 594 ++++++++++++++++++++++++++++++++++++++++++---
-+ 1 file changed, 554 insertions(+), 40 deletions(-)
-+
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 97c4c02..9cfc0d9 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -2,7 +2,10 @@
-+ #
-+ # ra0...ra7                                     eight horizontal filter coefficients
-+ #
-+-# rb1...rb7                                     seven shifted copies of the current unfiltered row
-++# rb0 rx_shift2
-++# rb1 ra_y2_next
-++#
-++# rb4...rb7
-+ #
-+ # ra8...ra15                                    eight filtered rows of context (rb15 == most recent)
-+ #
-+@@ -26,9 +29,9 @@
-+ # rb19                                          next ra16
-+ #
-+ # ra20                                          1
-+-# ra21                                          32
-++# ra21                                          ra_21
-+ # ra22                                          256
-+-# ra23                                          8
-++# ra23                                          rx_shift2_next
-+ #
-+ # rb20                                          0xffffff00
-+ # rb21                                          vpm_setup for reading/writing 16bit results into VPM
-+@@ -57,16 +60,23 @@
-+ .set rb_frame_width_minus_1,       rb25
-+ .set rb_frame_height_minus_1,      rb30
-+ .set rb_pitch,                     rb16
-+-.set ra_x_base,                    ra16
-+-.set rb_x_base_next,               rb19
-+-.set ra_x2_base,                   ra24
-+-.set ra_x2_base_next,              ra26
-++.set ra_x,                         ra16
-++.set ra_y2,                        ra21
-++.set ra_y2_next,                   rb1
-++
-++.set rb_x_next,                    rb19
-++.set rx_frame_base2_next,          rb19
-++
-++.set ra_frame_base,                ra24
-++.set ra_frame_base_next,           ra26
-+ .set ra_xshift,                    ra17
-+ 
-+-.set ra_x2shift,                   ra25
-+ .set ra_u2v_ref_offset,            ra25
-++.set ra_frame_base2,               ra25
-+ 
-+ .set ra_xshift_next,               ra19
-++.set rx_xshift2,                   rb0
-++.set rx_xshift2_next,              ra23
-+ 
-+ .set ra_x2shift_next,              ra27
-+ .set ra_u2v_dst_offset,            ra27
-+@@ -83,11 +93,11 @@
-+ mov ra31, unif
-+ 
-+ # Load first request location
-+-add ra_x_base, unif, elem_num # Store x
-++add ra_x, unif, elem_num # Store x
-+ mov ra_y, unif # Store y
-+-mov ra_x2_base, unif # Store frame u base
-++mov ra_frame_base, unif # Store frame u base
-+ nop
-+-sub ra_u2v_ref_offset, unif, ra_x2_base # Store offset to add to move from u to v in reference frame
-++sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
-+ 
-+ # Read image dimensions
-+ sub rb25,unif,1
-+@@ -104,9 +114,7 @@ add rb24, r1, r0
-+ # load constants
-+ 
-+ mov ra20, 1
-+-mov ra21, 32
-+ mov ra22, 256
-+-mov ra23, 8
-+ mov ra30, 64
-+ 
-+ mov rb20, 0xffffff00
-+@@ -156,18 +164,18 @@ mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which i
-+ add rb21, r0, r1
-+ 
-+ # Compute base address for first and second access
-+-mov r0, ra_x_base           # Load x
-++mov r0, ra_x           # Load x
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base  # Load the frame base
-++min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
-+ shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+ add ra_y, r1, 1
-+ add r0, r0, r3
-+ and r0, r0, ~3
-+-max r1, r1, 0 ; mov ra_x_base, r0 # y
-++max r1, r1, 0 ; mov ra_x, r0 # y
-+ min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-+ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-add t0s, r0, r1 ; mov ra_x2_base, r2
-++add t0s, r0, r1 ; mov ra_frame_base, r2
-+ add t0s, r2, r1
-+ 
-+ mov rb12,unif # offset before shift
-+@@ -182,8 +190,8 @@ min r1, r1, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1
-+ bra -, ra31
-+ nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_x_base
-+-add t0s, r1, ra_x2_base
-++add t0s, r1, ra_x
-++add t0s, r1, ra_frame_base
-+ 
-+ 
-+ 
-+@@ -192,7 +200,7 @@ add t0s, r1, ra_x2_base
-+ # mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-+ ::mc_filter_uv
-+ mov ra31, unif
-+ 
-+@@ -207,9 +215,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -265,16 +273,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -297,7 +305,7 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-+-mov rb12,32
-++mov rb12,32 # TODO remove these to make P weighted prediction work properly
-+ mov rb13,6
-+ mov rb14,1
-+ mov rb15,0
-+@@ -342,7 +350,7 @@ mov vw_addr, unif # start the VDW
-+ # mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x_base, ra_x16_base point to the current coordinates for this block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-+ ::mc_filter_uv_b0
-+ mov ra31, unif
-+ 
-+@@ -357,9 +365,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write, we need to save 16bit precision
-+ mov vw_setup, rb21
-+@@ -408,16 +416,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -477,9 +485,9 @@ min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+ shl ra_xshift_next, r0, 3
-+ sub r2, unif, r3 # compute offset from frame base u to frame base v
-+ add r0, r0, r3
-+-and rb_x_base_next, r0, ~3
-++and rb_x_next, r0, ~3
-+ mov ra_y_next, r1
-+-add ra_x2_base_next, rb_x_base_next, r2
-++add ra_frame_base_next, rb_x_next, r2
-+ 
-+ # set up VPM write
-+ mov vw_setup, rb28
-+@@ -538,16 +546,16 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+ 
-+ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_x2_base, r2
-++add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++add t0s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -642,5 +650,511 @@ nop        ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop        ; nop # delay slot 2
-+ 
-++
-++
-++
-++
-++# LUMA CODE
-++
-++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-++# For P frames we make the second x,y coordinates offset by +8
-++
-++################################################################################
-++# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-++::mc_setup
-++
-++# Read starting kernel
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# Compute base address for first and second access
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++add ra_y, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-++max r1, r1, 0
-++min r1, r1, rb_frame_height_minus_1
-++nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++add t0s, r2, r1 ; mov ra_frame_base, r2
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-++max r1, r1, 0
-++min r1, r1, rb_frame_height_minus_1
-++nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++add t0s, r2, r1 ; mov ra_frame_base2, r2
-++
-++
-++# Read image dimensions
-++sub rb25,unif,1
-++sub rb30,unif,1
-++
-++# get source pitch
-++mov rb16, unif
-++
-++# get destination pitch
-++mov r0, unif
-++mov r1, vdw_setup_1(0)
-++add rb24, r1, r0
-++
-++# load constants
-++
-++mov ra20, 1
-++mov ra22, 256
-++mov ra30, 64
-++
-++mov rb20, 0xffffff00
-++mov rb22, 255
-++mov rb23, 24
-++
-++# touch vertical context to keep simulator happy
-++
-++mov ra8, 0
-++mov ra9, 0
-++mov ra10, 0
-++mov ra11, 0
-++mov ra12, 0
-++mov ra13, 0
-++mov ra14, 0
-++mov ra15, 0
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, qpu_num
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1
-++
-++# Compute part of VPM to save data into
-++mov r2, qpu_num   # qpu_num = abcd
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++add rb28, r0, r1
-++
-++mov rb12,unif # offset before shift
-++mov rb13,unif # shift
-++
-++# Dump padding words
-++mov r0, unif
-++
-++# submit texture requests for second line
-++max r1, ra_y, 0
-++min r1, r1, rb_frame_height_minus_1
-++add ra_y, ra_y, 1
-++nop ; mul24 r1, r1, rb_pitch
-++add t0s, r1, ra_frame_base
-++
-++max r1, ra_y2, 0
-++min r1, r1, rb_frame_height_minus_1
-++bra -, ra31
-++add ra_y2, ra_y2, 1           # Delay 1
-++nop ; mul24 r1, r1, rb_pitch  # Delay 2
-++add t0s, r1, ra_frame_base2   # Delay 3
-++
-++
-++################################################################################
-++
-++# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# In a P block, only the first half of coefficients contain used information.
-++# At this point we have already issued two pairs of texture requests for the current block
-++# ra_x, ra_x16_base point to the current coordinates for this block
-++::mc_filter
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov rx_xshift2, rx_xshift2_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++mov ra_y_next, r1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0   ; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2_next, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-++
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients and discard unused B frame values
-++mov r0, unif
-++mov.ifnz -, unif # Alternate coefficients are unused for P frames
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++asr ra4, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++mov.ifnz -, unif
-++asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++asr rb4, r0, rb23
-++
-++mov r0, unif # Frame0 offset/weight
-++mov.ifnz -, unif # Frame1 offset/weight unused
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++
-++# r3 is loop counter
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:yloop
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# If we knew there was no clipping then this code would get simpler.
-++# Perhaps we could add on the pitch and clip using larger values?
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, rx_xshift2
-++mov.ifz ra_y2, ra_y2_next
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-++
-++max r2, ra_y2, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# apply horizontal filter
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 8    ; mov ra12, ra13
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++brr.anyn -, r:yloop
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-++add r1, r1, r0          ; mul24 r0, ra8, rb4
-++add r1, r1, r0          ; mul24 r0, ra9, rb5
-++add r1, r1, r0          ; mul24 r0, ra10, rb6
-++add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++nop                     ; mul24 r1, r1, rb14
-++add r1, r1, rb12
-++asr r1, r1, rb13
-++brr.anyn -, r:yloop
-++add r1, r1, rb15       # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out
-++
-++bra -, ra31
-++mov vw_setup, rb26 # VDW setup 0    Delay 1
-++mov vw_setup, rb29 # Stride         Delay 2
-++mov vw_addr, unif # start the VDW   Delay 3
-++
-++
-++
-++################################################################################
-++
-++# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# In a P block, only the first half of coefficients contain used information.
-++# At this point we have already issued two pairs of texture requests for the current block
-++# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-++# Can fill in the coefficients so only
-++# Can also assume default weighted prediction for B frames.
-++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
-++# Or possibly by taking advantage of symmetry?
-++# From 19->7 32bits per command.
-++::mc_filter_b
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++mov ra31, unif
-++
-++# per-channel shifts were calculated on the *previous* invocation
-++
-++mov ra_xshift, ra_xshift_next
-++mov rx_xshift2, rx_xshift2_next
-++
-++# get base addresses and per-channel shifts for *next* invocation
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl ra_xshift_next, r0, 3 # Compute shifts
-++mov ra_y_next, r1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-++
-++add r0, unif, elem_num # Load x
-++max r0, r0, 0   ; mov r1, unif # Load y
-++min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++shl rx_xshift2_next, r0, 3 # Compute shifts
-++add ra_y2_next, r1, 1
-++and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-++
-++
-++# set up VPM write
-++mov vw_setup, rb28
-++
-++# get width,height of block
-++mov r2, 16
-++mov r0, unif
-++shr r1, r0, r2 # Extract width
-++sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++and r0, r0, rb22 # Extract height
-++add rb17, r0, 5
-++add rb18, r0, 7
-++shl r0, r0, 7
-++add r0, r0, r1 # Combine width and height of destination area
-++shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-++add rb26, r0, rb27
-++
-++# get filter coefficients and discard unused B frame values
-++mov r0, unif
-++mov r1, 1
-++mov.ifnz r0, unif # Alternate coefficients are unused for P frames
-++nop              ;      mul24 r0, r0 << 13, r1 << 13
-++asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 14, r1 << 14
-++asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
-++asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++asr ra0, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++nop              ;      mul24 r0, r0 << 9, r1 << 9
-++asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 10, r1 << 10
-++asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 11, r1 << 11
-++asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++nop              ;      mul24 r0, r0 << 12, r1 << 12
-++asr ra4, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++asr rb8, r0, rb23;      mov r0, unif
-++mov.ifnz r0, unif
-++asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++asr rb4, r0, rb23
-++
-++mov r0, unif # Frame0 offset/weight
-++mov.ifnz r0, unif # Frame1 offset/weight unused
-++asr rb15, r0, r2  # Compute offset from MSBs
-++shl r0, r0, r2
-++asr rb14, r0, r2  # Compute weight from LSBs
-++
-++# r3 is loop counter
-++
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++mov r3, 0
-++
-++:yloopb
-++# retrieve texture results and pick out bytes
-++# then submit two more texture requests
-++
-++# If we knew there was no clipping then this code would get simpler.
-++# Perhaps we could add on the pitch and clip using larger values?
-++
-++sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++shr r1, r4, rx_xshift2
-++mov.ifz ra_y2, ra_y2_next
-++
-++max r2, ra_y, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-++
-++max r2, ra_y2, 0  # y
-++min r2, r2, rb_frame_height_minus_1
-++add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++
-++
-++# generate seven shifted versions
-++# interleave with scroll of vertical context
-++
-++mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++
-++# apply horizontal filter
-++nop                  ; mul24 r2, r0, ra0
-++nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++add r0, r2, r3       ; mov r3, rb31
-++sub.setf -, r3, 8    ; mov ra12, ra13
-++mov ra9, ra10
-++mov ra10, ra11
-++mov ra11, ra12
-++mov ra12, ra13
-++brr.anyn -, r:yloopb
-++mov ra13, ra14       # Delay slot 1
-++mov ra14, ra15       # Delay slot 2
-++mov ra15, r0         # Delay slot 3
-++
-++# apply vertical filter and write to VPM
-++
-++nop                     ; mul24 r1, ra14, rb10
-++nop                     ; mul24 r0, ra13, rb9
-++add r1, r1, r0          ; mul24 r0, ra12, rb8
-++add r1, r1, r0          ; mul24 r0, ra15, rb11
-++add r1, r1, r0          ; mul24 r0, ra8, rb4
-++add r1, r1, r0          ; mul24 r0, ra9, rb5
-++add r1, r1, r0          ; mul24 r0, ra10, rb6
-++add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++add r1, r1, r0          ; mov -, vw_wait
-++sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++asr r1, r1, 14
-++nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
-++add r1, r1, ra30        ; mul24 r0, r1, rb14
-++add r1, r1, r0
-++brr.anyn -, r:yloopb
-++asr r1, r1, 7          # Delay 1
-++min r1, r1, rb22       # Delay 2
-++max vpm, r1, 0         # Delay 3
-++
-++# DMA out
-++bra -, ra31
-++mov vw_setup, rb26 # VDW setup 0    Delay 1
-++mov vw_setup, rb29 # Stride         Delay 2
-++mov vw_addr, unif # start the VDW   Delay 3
-++
-++################################################################################
-++
-++# mc_interrupt_exit12()
-++::mc_interrupt_exit12
-++mov  -, vw_wait # wait on the VDW
-++
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++ldtmu0
-++
-++mov -,sacq(0) # 1
-++mov -,sacq(0) # 2
-++mov -,sacq(0) # 3
-++mov -,sacq(0) # 4
-++mov -,sacq(0) # 5
-++mov -,sacq(0) # 6
-++mov -,sacq(0) # 7
-++mov -,sacq(0) # 8
-++mov -,sacq(0) # 9
-++mov -,sacq(0) # 10
-++mov -,sacq(0) # 11
-++
-++nop        ; nop ; thrend
-++mov interrupt, 1; nop # delay slot 1
-++nop        ; nop # delay slot 2
-++
-++
-+ ::mc_end
-+ # Do not add code here because mc_end must appear after all other code.
-+-- 
-+2.5.0
-+
-+
-+From 3a5492970d13bf5ffe94898d59b3e882e7c8a1f5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 19:58:30 +0100
-+Subject: [PATCH 42/68] Added support for fast cache flush in deblocker
-+
-+---
-+ libavcodec/hevc_filter.c   |   44 +-
-+ libavcodec/rpi_qpu.c       |    6 +
-+ libavcodec/rpi_qpu.h       |    2 +
-+ libavcodec/rpi_shader.c    | 1028 +++++++++++++++++++++++++++++---------------
-+ libavcodec/rpi_shader.h    |   16 +-
-+ libavcodec/rpi_user_vcsm.h |   22 +
-+ 6 files changed, 768 insertions(+), 350 deletions(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 92a8271..186317a 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -37,6 +37,11 @@
-+ 
-+ #include "bit_depth_template.c"
-+ 
-++#ifdef RPI
-++#include "rpi_user_vcsm.h"
-++#include "rpi_qpu.h"
-++#endif
-++
-+ #define LUMA 0
-+ #define CB 1
-+ #define CR 2
-+@@ -872,15 +877,46 @@ static void flush_buffer(AVBufferRef *bref) {
-+     gpu_cache_flush(p);
-+ }
-+ 
-+-static void ff_hevc_flush_chroma(HEVCContext *s)
-++// Return Physical address for this image
-++static int ff_hevc_buf_base(AVBufferRef *bref) {
-++  GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++  return p->vc & 0x3fffffff;
-++}
-++
-++static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-++#define RPI_FAST_CACHEFLUSH
-++#ifdef RPI_FAST_CACHEFLUSH
-++        struct vcsm_user_clean_invalid_s iocache = {};
-++        int curr_y = f->progress->data[0];
-++        int sz,base;
-++        if (curr_y < 0) curr_y = 0;
-++        if (n<=curr_y) return; // Should not happen
-++        sz = s->frame->linesize[1] * (n-curr_y);
-++        base = s->frame->linesize[1] * curr_y;
-++        iocache.s[0].cmd = 3; // Flush L1 cache
-++        iocache.s[0].addr = 0;
-++        iocache.s[0].size  = 0;
-++
-++        iocache.s[1].cmd = 2;
-++        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-++        iocache.s[1].size  = sz;
-++
-++        iocache.s[2].cmd = 2;
-++        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
-++        iocache.s[2].size  = sz;
-++
-++        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
-++
-++#else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-++#endif
-+         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+         //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
-+@@ -903,7 +939,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x, y - ctb_size);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s);
-++                ff_hevc_flush_chroma(s,&s->ref->tf, y);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-+             }
-+@@ -912,7 +948,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x , y);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s);
-++                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+             }
-+@@ -922,7 +958,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+-        ff_hevc_flush_chroma(s);
-++        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index f62051f..fd8a276 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -237,6 +237,12 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+   return r;
-+ }
-+ 
-++int gpu_get_mailbox(void)
-++{
-++  assert(gpu);
-++  return gpu->mb;
-++}
-++
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+   void *tmp = vcsm_lock(p->vcsm_handle);
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 543c84b..88965e5 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -49,4 +49,6 @@ extern int rpi_test_shader(void);
-+ extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst);
-+ extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
-+ 
-++extern int gpu_get_mailbox(void);
-++
-+ #endif
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 3f04d80..9c30e32 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -23,11 +23,11 @@ __attribute__((aligned(8)))
-+ unsigned int rpi_shader[] = {
-+ // ::mc_setup_uv
-+ /* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x_base, unif, elem_num
-++/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+ /* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+-/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_x2_base, unif
-++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+ /* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_x2_base
-++/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+ /* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+ /* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+ /* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+@@ -35,360 +35,708 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+ /* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+ /* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000068] */ 0x00000020, 0xe0020567, // mov ra21, 32
-+-/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000078] */ 0x00000008, 0xe00205e7, // mov ra23, 8
-+-/* [0x00000080] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000088] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000090] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000098] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x000000a0] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x000000a8] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x000000b0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x000000b8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x000000c0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x000000c8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x000000d0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x000000d8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000e0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000e8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x000000f0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000f8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000100] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000110] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000118] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000120] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000148] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000150] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000158] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000160] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000168] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000170] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000178] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000180] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000188] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000190] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000198] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x000001a0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x000001a8] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001b0] */ 0x15427d80, 0x10020827, // mov r0, ra_x_base
-+-/* [0x000001b8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001c0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_x2_base
-+-/* [0x000001c8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001d0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001d8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001e0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001e8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x_base, r0
-+-/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001f8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x00000200] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_x2_base, r2
-+-/* [0x00000208] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000210] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000218] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000220] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000228] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000230] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000238] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000240] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000248] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000250] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x_base
-+-/* [0x00000258] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_x2_base
-++/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000070] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000078] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000080] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000088] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000090] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000098] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x000000a0] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x000000a8] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x000000b0] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000260] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000268] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000270] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000278] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000280] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000288] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000290] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000298] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000002a0] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000002a8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002b0] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000002b8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002c0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002d0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002d8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002e0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002e8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002f0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002f8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000300] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000308] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000310] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000340] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000350] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000358] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000360] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000370] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000380] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000388] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000390] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000398] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000003a0] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x000003a8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003b0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003b8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003c0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x000003c8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x000003d0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003d8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003e0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003e8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003f0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003f8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000400] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000408] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000410] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000418] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000420] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000428] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000430] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000438] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000440] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000448] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000458] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000460] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000468] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000470] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000478] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000480] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000488] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000490] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000498] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x000004a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000004a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000004b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000004b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004c0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004c8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004d0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004d8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004e0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004f0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004f8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00000500] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000508] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000510] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000518] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000520] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000528] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000530] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000538] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000540] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000548] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000550] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000558] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000560] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000568] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000570] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000578] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000580] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000588] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000590] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000598] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x000005a0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000005a8] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x000005b0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x000005b8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005c8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005d0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005d8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005e0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005e8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005f0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000600] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000608] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000610] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000638] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000648] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000650] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000658] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000660] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000668] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000670] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000678] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000680] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000688] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000690] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000698] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000006a0] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000006a8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006b8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006c0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006c8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000700] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000708] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000710] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000718] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000720] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000728] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000730] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000738] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000740] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000748] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000750] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000758] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000760] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000768] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000770] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000778] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000780] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000788] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000798] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x000007a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007b0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x000007b8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007c0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007c8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007d0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007d8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007e8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007f0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007f8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_base_next, r0, ~3
-+-/* [0x00000800] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000808] */ 0x0c9d3e80, 0x100206a7, // add ra_x2_base_next, rb_x_base_next, r2
-+-/* [0x00000810] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000818] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000820] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000828] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000830] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000838] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000840] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000848] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000850] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000858] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000860] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000868] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000870] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000878] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000880] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000888] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000890] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008b8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008c8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008d0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008d8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008e8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008f8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x00000900] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000908] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x_base, rb_x_base_next       ; ldtmu0
-+-/* [0x00000910] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_x2_base, ra_x2_base_next ; mov rb31, r3
-+-/* [0x00000918] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000920] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000928] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000930] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000938] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000940] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x_base, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000948] */ 0x0c627c80, 0x10020e27, // add t0s, ra_x2_base, r2
-+-/* [0x00000950] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000958] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000960] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000968] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000970] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000978] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000980] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000988] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000990] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000998] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x000009a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x000009a8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009b0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009b8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009c0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009c8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009d0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009d8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009e0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x00000a00] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x00000a08] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a10] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a18] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a20] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a28] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a30] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a38] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a40] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a48] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a50] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a58] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a60] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a68] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a70] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a78] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a80] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000ab0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ae0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b18] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b20] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b28] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b30] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_setup
-++/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-++/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-++// ::mc_filter
-++/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :yloop
-++/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_filter_b
-++/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
-++/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-++/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-++/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-++/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-++/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-++/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-++/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-++/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++// :yloopb
-++/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-++/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-++/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++// ::mc_interrupt_exit12
-++/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index cec9901..3fa8531 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,11 +4,15 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 152)
-+-#define mc_filter_uv_b0 (rpi_shader + 342)
-+-#define mc_filter_uv_b (rpi_shader + 494)
-+-#define mc_exit (rpi_shader + 670)
-+-#define mc_interrupt_exit8 (rpi_shader + 688)
-+-#define mc_end (rpi_shader + 718)
-++#define mc_filter_uv (rpi_shader + 148)
-++#define mc_filter_uv_b0 (rpi_shader + 338)
-++#define mc_filter_uv_b (rpi_shader + 490)
-++#define mc_exit (rpi_shader + 666)
-++#define mc_interrupt_exit8 (rpi_shader + 684)
-++#define mc_setup (rpi_shader + 714)
-++#define mc_filter (rpi_shader + 868)
-++#define mc_filter_b (rpi_shader + 1108)
-++#define mc_interrupt_exit12 (rpi_shader + 1364)
-++#define mc_end (rpi_shader + 1402)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+index fbebbbe..95e6de1 100644
-+--- a/libavcodec/rpi_user_vcsm.h
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -418,6 +418,28 @@ int vcsm_unlock_hdl( unsigned int handle );
-+ */
-+ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+ 
-++/* Clean and/or invalidate the memory associated with this user opaque handle
-++**
-++** Returns:        non-zero on error
-++**
-++** structure contains a list of flush/invalidate commands. Commands are:
-++** 0: nop
-++** 1: invalidate given physical range in L2
-++** 2: clean      given physical range in L2
-++** 3: clean+invalidate all of L1
-++** 4: flush      all of L2 and all of L1
-++*/
-++struct vcsm_user_clean_invalid_s {
-++    struct {
-++       unsigned int cmd;
-++       unsigned int addr;
-++       unsigned int size;
-++    } s[8];
-++};
-++
-++int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
-++
-++
-+ #ifdef __cplusplus
-+ }
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 855a757b894c53ddeadeb94c74bd11d3b2fa1fd3 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 20 May 2015 21:12:55 +0100
-+Subject: [PATCH 43/68] Added multi mailbox - not working
-+
-+---
-+ libavcodec/hevc.c        | 40 ++++++++++++++++++++++++++++---
-+ libavcodec/rpi_mailbox.c | 47 +++++++++++++++++++++++++++++++++++++
-+ libavcodec/rpi_mailbox.h |  5 ++++
-+ libavcodec/rpi_qpu.c     | 61 ++++++++++++++++++++++++++++++++++++++++++++----
-+ libavcodec/rpi_qpu.h     |  2 ++
-+ 5 files changed, 147 insertions(+), 8 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 1cc7900..9bf0d28 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -45,6 +45,11 @@
-+   //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-++
-++  #ifdef RPI_INTER_QPU
-++    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-++    #define RPI_MULTI_MAILBOX
-++  #endif
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -2830,10 +2835,14 @@ static void rpi_inter_clear(HEVCContext *s)
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-++    int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+-
-+-    if (s->sh.slice_type == I_SLICE)
-+-        return;
-++    if (s->sh.slice_type == I_SLICE) {
-++#ifdef RPI_MULTI_MAILBOX
-++      rpi_execute_transform(s);
-++      return;
-++#endif
-++    }
-+     for(k=0;k<8;k++) {
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+@@ -2843,6 +2852,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-++#ifdef RPI_MULTI_MAILBOX
-++    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-++                                   qpu_get_fn(QPU_MC_SETUP_UV),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++                                 );
-++    for(i=0;i<4;i++)
-++        s->num_coeffs[i] = 0;
-++#else
-+     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+       (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+       (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+@@ -2853,6 +2878,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+       (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+       );
-++#endif
-+ }
-+ #endif
-+ 
-+@@ -2932,6 +2958,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+             // Transform all blocks
-+             // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++#ifdef RPI_MULTI_MAILBOX
-++            // Kick off inter prediction on QPUs
-++            rpi_execute_inter_qpu(s);
-++            // Perform luma inter prediction
-++            rpi_execute_inter_cmds(s);
-++#else
-+             rpi_execute_transform(s);
-+             // Perform inter prediction
-+             rpi_execute_inter_cmds(s);
-+@@ -2939,6 +2971,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             // Kick off inter prediction on QPUs
-+             rpi_execute_inter_qpu(s);
-+ #endif
-++#endif
-++
-+             // Wait for transform completion
-+             vpu_wait(s->vpu_id);
-+ 
-+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-+index 77a56dd..3904efc 100644
-+--- a/libavcodec/rpi_mailbox.c
-++++ b/libavcodec/rpi_mailbox.c
-+@@ -276,6 +276,53 @@ unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigne
-+    return p[5];
-+ }
-+ 
-++void execute_multi(int file_desc,
-++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
-++   int i=0;
-++   unsigned p[32];
-++
-++   p[i++] = 0; // size
-++   p[i++] = 0x00000000; // process request
-++   p[i++] = 0x30018; // (the tag id)
-++   p[i++] = 88; // (size of the buffer)
-++   p[i++] = 88; // (size of the data)
-++
-++   p[i++] = num_qpus;
-++   p[i++] = control;
-++   p[i++] = noflush;
-++   p[i++] = timeout; // ms
-++
-++   p[i++] = num_qpus_2;
-++   p[i++] = control_2;
-++   p[i++] = noflush_2;
-++   p[i++] = timeout_2; // ms
-++
-++   p[i++] = code;
-++   p[i++] = r0;
-++   p[i++] = r1;
-++   p[i++] = r2;
-++   p[i++] = r3;
-++   p[i++] = r4;
-++   p[i++] = r5;
-++
-++   p[i++] = code_2;
-++   p[i++] = r0_2;
-++   p[i++] = r1_2;
-++   p[i++] = r2_2;
-++   p[i++] = r3_2;
-++   p[i++] = r4_2;
-++   p[i++] = r5_2;
-++
-++   p[i++] = 0x00000000; // end tag
-++   p[0] = i*sizeof *p; // actual size
-++
-++   mbox_property(file_desc, p);
-++   return;
-++}
-++
-+ int mbox_open() {
-+    int file_desc;
-+ 
-+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-+index c264d2e..5898102 100644
-+--- a/libavcodec/rpi_mailbox.h
-++++ b/libavcodec/rpi_mailbox.h
-+@@ -15,6 +15,11 @@ extern void unmapmem(void *addr, unsigned size);
-+ 
-+ extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
-++extern void execute_multi(int file_desc,
-++   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
-++   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
-++   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
-+ extern unsigned qpu_enable(int file_desc, unsigned enable);
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index fd8a276..feb3284 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -123,7 +123,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-+ static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+-static int vpu_cmds[MAXCMDS][8];
-++static int vpu_cmds[MAXCMDS][16];
-+ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+@@ -346,6 +346,7 @@ unsigned int vpu_get_constants(void) {
-+ static void *vpu_start(void *arg) {
-+   while(1) {
-+     int *p;
-++    int qpu_code;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -358,12 +359,25 @@ static void *vpu_start(void *arg) {
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-+-    if (p[7]) {
-++    qpu_code = p[7];
-++    //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+-    }
-+-    vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++    //}
-++    if (!qpu_code) {
-++      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++    } else {
-++      int i;
-++      for(i=0;i<8;i++) {
-++        gpu->mail[i*2] = p[8+i];
-++        gpu->mail[i*2 + 1] = qpu_code;
-++      }
-+ 
-++      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              0, 0, 0, 0,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++    }
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+@@ -400,7 +414,43 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+     p[4] = r3;
-+     p[5] = r4;
-+     p[6] = r5;
-+-    p[7] = (int) buf;
-++    p[7] = 0;
-++    if (num<=1)
-++      pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-++    pthread_mutex_unlock(&post_mutex);
-++    return id;
-++  }
-++}
-++
-++int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++{
-++
-++  pthread_mutex_lock(&post_mutex);
-++  {
-++    int id = vpu_async_tail++;
-++    int *p = vpu_cmds[id%MAXCMDS];
-++    int num = vpu_async_tail - vpu_async_head;
-++    if (num>MAXCMDS) {
-++      printf("Too many commands submitted\n");
-++      exit(-1);
-++    }
-++    p[0] = vpu_code;
-++    p[1] = r0;
-++    p[2] = r1;
-++    p[3] = r2;
-++    p[4] = r3;
-++    p[5] = r4;
-++    p[6] = r5;
-++    p[7] = qpu_code;
-++    p[8 ] = unifs1;
-++    p[9 ] = unifs2;
-++    p[10] = unifs3;
-++    p[11] = unifs4;
-++    p[12] = unifs5;
-++    p[13] = unifs6;
-++    p[14] = unifs7;
-++    p[15] = unifs8;
-+     if (num<=1)
-+       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -966,6 +1016,7 @@ void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, i
-+ }
-+ 
-+ 
-++
-+ #endif
-+ 
-+ #endif // RPI
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 88965e5..2f08f03 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -41,6 +41,8 @@ extern unsigned int vpu_get_fn(void);
-+ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-++int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+ extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+-- 
-+2.5.0
-+
-+
-+From e576989224bf22d2b945e9ded8b27bafe1bd5417 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 21 May 2015 16:50:02 +0100
-+Subject: [PATCH 44/68] Pass qpu number in as uniform
-+
-+---
-+ libavcodec/hevc.c          |    2 +-
-+ libavcodec/rpi_shader.c    | 1288 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   20 +-
-+ libavcodec/rpi_shader.qasm |   10 +-
-+ 4 files changed, 657 insertions(+), 663 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 9bf0d28..25e1cbd 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -2821,6 +2821,7 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-++        *s->u_mvs[i]++ = i;
-+         if (weight_flag) {
-+             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+@@ -2828,7 +2829,6 @@ static void rpi_inter_clear(HEVCContext *s)
-+             *s->u_mvs[i]++ = 1 << 5;
-+             *s->u_mvs[i]++ = 6;
-+         }
-+-        s->u_mvs[i] += 1;  // Padding words
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index 9c30e32..a0f0282 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -48,8 +48,8 @@ unsigned int rpi_shader[] = {
-+ /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+ /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+ /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x000000d8] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
-++/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+ /* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+ /* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+ /* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+@@ -60,669 +60,669 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+ /* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+ /* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000130] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000138] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-/* [0x00000140] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000148] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000150] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000158] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000160] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000168] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000170] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000178] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000180] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000188] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000190] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x00000198] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x000001a0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+-/* [0x000001a8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001b0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+-/* [0x000001b8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001c0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001c8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001d8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+-/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001e8] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001f0] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+-/* [0x000001f8] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x00000200] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000208] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000210] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000218] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000220] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000228] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000230] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000238] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000240] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+-/* [0x00000248] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-++/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-++/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-++/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-++/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000250] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000258] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000260] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000268] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000270] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000278] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000280] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000288] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000290] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000298] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000002a0] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000002a8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002b0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002c0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002c8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002d0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002d8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002e0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002e8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002f8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000300] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000308] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000310] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000320] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000330] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000340] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000348] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000350] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000358] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000360] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000368] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000370] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000378] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000380] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000388] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000390] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x00000398] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000003a0] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-++/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x000003a8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003b0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000003b8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000003c0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003c8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003d0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003d8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003e0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003e8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003f0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000003f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000400] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000408] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000410] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000418] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000420] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000428] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000430] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000438] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000448] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000450] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000458] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000460] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000468] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000470] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000478] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000480] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000488] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x00000490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000004a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000004a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004b0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004d0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004d8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004e0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004e8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x000004f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000500] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000508] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000510] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000518] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000520] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000528] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000530] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000538] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000540] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
-++/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
-++/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
-++/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
-++/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000548] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000550] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000558] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000560] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000568] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000570] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000578] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000580] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000588] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000590] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000598] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000005a0] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x000005a8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005b0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005b8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005c0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005c8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005d0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005d8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005e0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005f0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000600] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000608] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000618] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000628] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000638] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000640] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000648] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000650] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000658] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000660] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000668] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000670] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000678] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000680] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000688] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000690] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000698] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000006a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000006a8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006b0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006b8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000006c0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006c8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006d0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006d8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006e0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006e8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006f0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006f8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000700] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000708] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000710] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000718] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000720] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000728] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000730] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000738] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000740] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000748] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000750] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000758] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000760] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000768] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000770] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000778] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000780] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000788] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000790] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000798] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000007a0] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x000007a8] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007b0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007b8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007c0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007c8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007d0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007d8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007e0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007e8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x000007f0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007f8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000800] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000808] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000810] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000818] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000820] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000828] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000830] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000838] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000840] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000848] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000850] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000858] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000860] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000868] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000870] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000878] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000880] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000888] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000898] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x000008a8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008b8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008c0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008c8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008d8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008e0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008e8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008f8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000900] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000908] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000910] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000918] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000920] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000928] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000930] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000938] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x00000940] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000948] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000950] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000958] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000960] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000968] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000970] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000978] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000980] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000988] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000990] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000998] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009a0] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x000009a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009b0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009b8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009c0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009c8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009d0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009d8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009e0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009e8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009f0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009f8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x00000a00] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000a08] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a10] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a18] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a20] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a28] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a30] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a38] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a48] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a50] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a58] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a60] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-++/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a70] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a90] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a98] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000aa0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000aa8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000ab0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ad0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b08] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b20] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup
-+-/* [0x00000b28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b30] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000b38] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b40] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b48] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b50] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000b58] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00000b60] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000b68] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000b70] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000b78] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000b80] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000b88] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+-/* [0x00000b90] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b98] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000ba0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000ba8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000bb0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+-/* [0x00000bb8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000bc0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000bc8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000bd0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000bd8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000be0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-/* [0x00000be8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000bf0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000bf8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000c00] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000c08] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000c10] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000c18] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000c20] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000c28] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000c30] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000c38] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000c40] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000c48] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000c50] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00000c58] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00000c60] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00000c68] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00000c70] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00000c78] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00000c80] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00000c88] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000c90] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000c98] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000ca0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000ca8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000cb0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000cb8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000cc0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000cc8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000cd0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000cd8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000ce0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000ce8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000cf0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000cf8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000d00] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000d08] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000d10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000d18] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000d20] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000d28] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000d30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000d38] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000d40] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d48] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000d50] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d58] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+-/* [0x00000d60] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+-/* [0x00000d68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d70] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000d78] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+-/* [0x00000d80] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d88] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-++/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-++/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-++/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-++/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
-++/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
-++/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-++/* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+ // ::mc_filter
-+-/* [0x00000d90] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000d98] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000da0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000da8] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00000db0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000db8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000dc0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000dc8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000dd0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000dd8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000de0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00000de8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000df0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x00000df8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000e00] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000e08] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x00000e10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000e18] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x00000e20] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e28] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e30] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e38] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e40] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000e48] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000e50] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000e58] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000e60] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000e68] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000e70] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000e78] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000e80] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e88] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000e90] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e98] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ea0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ea8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000eb0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000eb8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ed0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000ed8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ef0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ef8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000f00] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f08] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f10] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f18] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f20] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00000f28] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f30] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f38] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000f40] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000f48] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000f50] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-++/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloop
-+-/* [0x00000f58] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00000f60] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00000f68] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000f70] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f78] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00000f80] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00000f88] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f90] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f98] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fa0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fa8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x00000fb0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000fb8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fc0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fd0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000fd8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000fe0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fe8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000ff0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000ff8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001000] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001008] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001010] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001018] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001020] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001028] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001030] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001038] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001040] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001048] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001050] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001058] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001060] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001068] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001070] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001078] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001080] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001088] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001090] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001098] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000010a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000010a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000010b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000010b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000010c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000010c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000010d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000010d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000010e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000010e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000010f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000010f8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x00001100] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x00001108] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x00001110] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001118] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00001120] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001128] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001130] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001138] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001140] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001148] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00001150] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001158] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001160] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001168] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00001170] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00001178] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00001180] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00001188] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00001190] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00001198] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011a0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x000011a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000011b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x000011b8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x000011c0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x000011c8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x000011d0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011d8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x000011e0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000011e8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000011f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000011f8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00001200] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00001208] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001210] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001218] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001220] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001228] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001230] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001238] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001240] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001248] */ 0x00000001, 0xe0020867, // mov r1, 1
-+-/* [0x00001250] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001258] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-/* [0x00001260] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001268] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-/* [0x00001270] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001278] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+-/* [0x00001280] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001288] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001290] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001298] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-/* [0x000012a0] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012a8] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-/* [0x000012b0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012b8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-/* [0x000012c0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012c8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-/* [0x000012d0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000012d8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012e0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012e8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012f0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012f8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00001300] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001308] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001310] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001318] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001320] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00001328] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001330] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001338] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00001340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001348] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00001350] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-++/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-++/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-++/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-++/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
-++/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-++/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-++/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-++/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-++/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-++/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-++/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-++/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-++/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-++/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-++/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-++/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-++/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloopb
-+-/* [0x00001358] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00001360] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00001368] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00001370] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001378] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00001380] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00001388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x000013a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x000013b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000013b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x000013c0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000013d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000013d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000013e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000013e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000013f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000013f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00001400] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00001408] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001410] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001418] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001420] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001428] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001430] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001438] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001440] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001448] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001450] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001458] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001460] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001468] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001470] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001478] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001480] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001488] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001490] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001498] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000014a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000014a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000014b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000014b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000014c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000014c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000014d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000014d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000014e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000014e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000014f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000014f8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+-/* [0x00001500] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-/* [0x00001508] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00001510] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001518] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00001520] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001528] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001530] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001538] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001540] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001548] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-++/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-++/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-++/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-++/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_interrupt_exit12
-+-/* [0x00001550] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+ /* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001568] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001570] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+@@ -732,11 +732,9 @@ unsigned int rpi_shader[] = {
-+ /* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+ /* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000015d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000015e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 3fa8531..6e552d9 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,15 +4,15 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 148)
-+-#define mc_filter_uv_b0 (rpi_shader + 338)
-+-#define mc_filter_uv_b (rpi_shader + 490)
-+-#define mc_exit (rpi_shader + 666)
-+-#define mc_interrupt_exit8 (rpi_shader + 684)
-+-#define mc_setup (rpi_shader + 714)
-+-#define mc_filter (rpi_shader + 868)
-+-#define mc_filter_b (rpi_shader + 1108)
-+-#define mc_interrupt_exit12 (rpi_shader + 1364)
-+-#define mc_end (rpi_shader + 1402)
-++#define mc_filter_uv (rpi_shader + 144)
-++#define mc_filter_uv_b0 (rpi_shader + 334)
-++#define mc_filter_uv_b (rpi_shader + 486)
-++#define mc_exit (rpi_shader + 662)
-++#define mc_interrupt_exit8 (rpi_shader + 680)
-++#define mc_setup (rpi_shader + 710)
-++#define mc_filter (rpi_shader + 864)
-++#define mc_filter_b (rpi_shader + 1104)
-++#define mc_interrupt_exit12 (rpi_shader + 1360)
-++#define mc_end (rpi_shader + 1398)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 9cfc0d9..a0b8e5a 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -133,8 +133,8 @@ mov ra14, 0
-+ mov ra15, 0
-+ 
-+ # Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-++mov r3, unif
-++shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+ and r2, r2, 15
-+ mov r1, r2
-+ asr r1, r1, 2
-+@@ -147,8 +147,7 @@ shl r0, r0, 5
-+ add rb27, r0, r1
-+ 
-+ # Compute part of VPM to save data into
-+-mov r2, qpu_num   # qpu_num = abcd
-+-shl r2, r2, 1
-++shl r2, r3, 1
-+ and r2, r2, 15    # r2 = bcd0
-+ mov r1, r2        # r1 = bcd0
-+ asr r1, r1, 2     # r1 = bc
-+@@ -181,9 +180,6 @@ add t0s, r2, r1
-+ mov rb12,unif # offset before shift
-+ mov rb13,unif # offset after shift
-+ 
-+-# Dump padding words
-+-mov r0, unif
-+-
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+ min r1, r1, rb_frame_height_minus_1
-+-- 
-+2.5.0
-+
-+
-+From 2372b3e0797cfce130103357085d21baecb0d5a8 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Sat, 23 May 2015 13:20:21 +0100
-+Subject: [PATCH 45/68] Add new cache flushing routine
-+
-+---
-+ libavcodec/hevc.c          |  8 +++--
-+ libavcodec/hevc_filter.c   | 39 ++++++++++-----------
-+ libavcodec/rpi_qpu.c       | 17 +++++++--
-+ libavcodec/rpi_qpu.h       |  2 ++
-+ libavcodec/rpi_user_vcsm.h | 86 ++++++++++++++++++++++++++--------------------
-+ 5 files changed, 91 insertions(+), 61 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 25e1cbd..31bbf67 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3498,9 +3498,13 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-+     }
-+ 
-+ fail:
-+-    if (s->ref && s->threads_type == FF_THREAD_FRAME)
-++    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-++#ifdef RPI_INTER_QPU
-++        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-++        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-++#endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-+-
-++    }
-+     return ret;
-+ }
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 186317a..ec84e8a 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -883,36 +883,35 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+   return p->vc & 0x3fffffff;
-+ }
-+ 
-+-static void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-++void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-++void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+             s->nal_unit_type == NAL_STSA_N  ||
-+             s->nal_unit_type == NAL_RADL_N  ||
-+             s->nal_unit_type == NAL_RASL_N )) {
-+-#define RPI_FAST_CACHEFLUSH
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+-        int curr_y = f->progress->data[0];
-++        int curr_y = ((int *)f->progress->data)[0];
-++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-++        int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-+-        if (curr_y < 0) curr_y = 0;
-+-        if (n<=curr_y) return; // Should not happen
-+-        sz = s->frame->linesize[1] * (n-curr_y);
-+-        base = s->frame->linesize[1] * curr_y;
-+-        iocache.s[0].cmd = 3; // Flush L1 cache
-+-        iocache.s[0].addr = 0;
-+-        iocache.s[0].size  = 0;
-+-
-+-        iocache.s[1].cmd = 2;
-+-        iocache.s[1].addr = ff_hevc_buf_base(s->frame->buf[1]) + base;
-++        if (curr_uv < 0) curr_uv = 0;
-++        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++        base = s->frame->linesize[1] * curr_uv;
-++        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-++        iocache.s[0].handle = p->vcsm_handle;
-++        iocache.s[0].cmd = 3; // clean+invalidate
-++        iocache.s[0].addr = p->arm + base;
-++        iocache.s[0].size  = sz;
-++        p = av_buffer_pool_opaque(s->frame->buf[2]);
-++        iocache.s[1].handle = p->vcsm_handle;
-++        iocache.s[1].cmd = 3; // clean+invalidate
-++        iocache.s[1].addr = p->arm + base;
-+         iocache.s[1].size  = sz;
-+-
-+-        iocache.s[2].cmd = 2;
-+-        iocache.s[2].addr = ff_hevc_buf_base(s->frame->buf[2]) + base;
-+-        iocache.s[2].size  = sz;
-+-
-+-        vcsm_clean_invalid( gpu_get_mailbox(), &iocache );
-+-
-++        vcsm_clean_invalid( &iocache );
-+ #else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index feb3284..aa65a77 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -211,6 +211,7 @@ static void gpu_unlock(void) {
-+ }
-+ 
-+ static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-++  p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+   assert(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+@@ -243,13 +244,25 @@ int gpu_get_mailbox(void)
-+   return gpu->mb;
-+ }
-+ 
-++// Call this to clean and invalidate a region of memory
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-  void *tmp = vcsm_lock(p->vcsm_handle);
-+-  vcsm_unlock_ptr(tmp);
-++#define RPI_FAST_CACHEFLUSH
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm;
-++    iocache.s[0].size  = p->numbytes;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    void *tmp = vcsm_lock(p->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++#endif
-+ }
-+ 
-+ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-++  p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 2f08f03..0565a60 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -1,6 +1,8 @@
-+ #ifndef RPI_QPU_H
-+ #define RPI_QPU_H
-+ 
-++#define RPI_FAST_CACHEFLUSH
-++
-+ typedef struct gpu_mem_ptr_s {
-+   unsigned char *arm; // Pointer to memory mapped on ARM side
-+   int vc_handle;   // Videocore handle of relocatable memory
-+diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
-+index 95e6de1..db41a4d 100644
-+--- a/libavcodec/rpi_user_vcsm.h
-++++ b/libavcodec/rpi_user_vcsm.h
-+@@ -1,29 +1,41 @@
-+-/*
-+-Copyright (c) 2012, Broadcom Europe Ltd
-+-All rights reserved.
-+-
-+-Redistribution and use in source and binary forms, with or without
-+-modification, are permitted provided that the following conditions are met:
-+-    * Redistributions of source code must retain the above copyright
-+-      notice, this list of conditions and the following disclaimer.
-+-    * Redistributions in binary form must reproduce the above copyright
-+-      notice, this list of conditions and the following disclaimer in the
-+-      documentation and/or other materials provided with the distribution.
-+-    * Neither the name of the copyright holder nor the
-+-      names of its contributors may be used to endorse or promote products
-+-      derived from this software without specific prior written permission.
-+-
-+-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+-*/
-++/*****************************************************************************
-++* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
-++*
-++* This program is the proprietary software of Broadcom Corporation and/or
-++* its licensors, and may only be used, duplicated, modified or distributed
-++* pursuant to the terms and conditions of a separate, written license
-++* agreement executed between you and Broadcom (an "Authorized License").
-++* Except as set forth in an Authorized License, Broadcom grants no license
-++* (express or implied), right to use, or waiver of any kind with respect to
-++* the Software, and Broadcom expressly reserves all rights in and to the
-++* Software and all intellectual property rights therein.  IF YOU HAVE NO
-++* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-++* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-++* THE SOFTWARE.
-++*
-++* Except as expressly set forth in the Authorized License,
-++* 1. This program, including its structure, sequence and organization,
-++*    constitutes the valuable trade secrets of Broadcom, and you shall use
-++*    all reasonable efforts to protect the confidentiality thereof, and to
-++*    use this information only in connection with your use of Broadcom
-++*    integrated circuit products.
-++* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-++*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-++*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-++*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-++*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-++*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-++*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-++*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-++* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-++*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-++*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-++*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-++*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-++*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-++*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-++*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-++*****************************************************************************/
-+ 
-+ #ifndef __USER_VCSM__H__INCLUDED__
-+ #define __USER_VCSM__H__INCLUDED__
-+@@ -424,21 +436,21 @@ int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+ **
-+ ** structure contains a list of flush/invalidate commands. Commands are:
-+ ** 0: nop
-+-** 1: invalidate given physical range in L2
-+-** 2: clean      given physical range in L2
-+-** 3: clean+invalidate all of L1
-+-** 4: flush      all of L2 and all of L1
-++** 1: invalidate       given virtual range in L1/L2
-++** 2: clean            given virtual range in L1/L2
-++** 3: clean+invalidate given virtual range in L1/L2
-++** 4: flush all L1/L2
-+ */
-+ struct vcsm_user_clean_invalid_s {
-+-    struct {
-+-       unsigned int cmd;
-+-       unsigned int addr;
-+-       unsigned int size;
-+-    } s[8];
-++   struct {
-++      unsigned int cmd;
-++      unsigned int handle;
-++      unsigned int addr;
-++      unsigned int size;
-++   } s[8];
-+ };
-+ 
-+-int vcsm_clean_invalid( unsigned int handle, struct vcsm_user_clean_invalid_s *s );
-+-
-++int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+ 
-+ #ifdef __cplusplus
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 7ddf50b155ce8417e8b27735098b3651567f07e5 Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Sat, 23 May 2015 21:10:10 +0100
-+Subject: [PATCH 46/68] Fix multi mailbox extra transform call
-+
-+---
-+ libavcodec/hevc.c | 2 ++
-+ 1 file changed, 2 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 31bbf67..f479707 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3011,7 +3011,9 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_INTER_QPU
-+         rpi_execute_inter_qpu(s);
-+ #endif
-++#ifndef RPI_MULTI_MAILBOX
-+         rpi_execute_transform(s);
-++#endif
-+         rpi_execute_inter_cmds(s);
-+         vpu_wait(s->vpu_id);
-+         rpi_execute_pred_cmds(s);
-+-- 
-+2.5.0
-+
-+
-+From 9d16a24e225841b0ba09006edcd052ac2ccaf335 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 27 May 2015 16:44:29 +0100
-+Subject: [PATCH 47/68] Added support for running luma prediction on QPUs
-+
-+---
-+ libavcodec/hevc.c          |  237 +++++++-
-+ libavcodec/hevc.h          |   26 +-
-+ libavcodec/hevc_filter.c   |   23 +-
-+ libavcodec/rpi_qpu.c       |  156 ++++--
-+ libavcodec/rpi_qpu.h       |    8 +-
-+ libavcodec/rpi_shader.c    | 1313 ++++++++++++++++++++++----------------------
-+ libavcodec/rpi_shader.h    |   21 +-
-+ libavcodec/rpi_shader.qasm |  883 ++++++++++++++---------------
-+ 8 files changed, 1464 insertions(+), 1203 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index f479707..c6b619b 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -50,6 +50,11 @@
-+     // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
-+     #define RPI_MULTI_MAILBOX
-+   #endif
-++
-++  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-++  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-++
-++
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -72,6 +77,13 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-++// Split image of 2048 into parts 64 wide
-++// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-++// Each block of 64*64
-++// Smallest CTU size is 16x16, so smallest block is 8x8
-++// Corresponds to a total of 83kbytes over all 12 QPUs
-++#define RPI_LUMA_COMMAND_WORDS 9
-++#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -2002,10 +2014,46 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            int reflist = 0;
-++            const Mv *mv         = &current_mv.mv[reflist];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  if (weight_flag) {
-++                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-++                  } else {
-++                      *y++ = 1; // Weight of 1 and offset of 0
-++                  }
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++        {
-++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l0[current_mv.ref_idx[0]],
-+                     s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2065,10 +2113,47 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            int reflist = 1;
-++            const Mv *mv    = &current_mv.mv[reflist];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-++                              (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  if (weight_flag) {
-++                      *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-++                  } else {
-++                      *y++ = 1; // Weight of 1 and offset of 0
-++                  }
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++
-++        {
-++            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
-+                     &current_mv.mv[1], x0, y0, nPbW, nPbH,
-+                     s->sh.luma_weight_l1[current_mv.ref_idx[1]],
-+                     s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2102,8 +2187,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       if (weight_flag) {
-+-                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][0] & 0xffff);
-+-                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[1]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[1]][1] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][0] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][0] & 0xffff);
-++                          *u++ = (s->sh.chroma_offset_l0[current_mv.ref_idx[reflist]][1] << 16) + (s->sh.chroma_weight_l0[current_mv.ref_idx[reflist]][1] & 0xffff);
-+                       } else {
-+                           *u++ = 1; // Weight of 1 and offset of 0
-+                           *u++ = 1;
-+@@ -2130,9 +2215,44 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+         int nPbW_c = nPbW >> s->ps.sps->hshift[1];
-+         int nPbH_c = nPbH >> s->ps.sps->vshift[1];
-+ 
-+-        RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-++#ifdef RPI_LUMA_QPU
-++        if (s->enable_rpi) {
-++            const Mv *mv    = &current_mv.mv[0];
-++            int mx          = mv->x & 3;
-++            int my          = mv->y & 3;
-++            int my_mx = (my<<8) + mx;
-++            const Mv *mv2    = &current_mv.mv[1];
-++            int mx2          = mv2->x & 3;
-++            int my2          = mv2->y & 3;
-++            int my2_mx2 = (my2<<8) + mx2;
-++            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
-++            int x1 = x0 + (mv->x >> 2);
-++            int y1 = y0 + (mv->y >> 2);
-++            int x2 = x0 + (mv2->x >> 2);
-++            int y2 = y0 + (mv2->y >> 2);
-++            int chan = x0>>6; // 64 wide blocks per QPU
-++            uint32_t *y = s->y_mvs[chan % 12];
-++            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-++              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-++                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = my2_mx2_my_mx;
-++                  *y++ = 1; // B frame weighted prediction not supported
-++                  *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-++                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-++                }
-++            }
-++            s->y_mvs[chan % 12] = y;
-++        } else
-++#endif
-++        {
-++            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
-+                    &current_mv.mv[0], x0, y0, nPbW, nPbH,
-+                    ref1->frame, &current_mv.mv[1], &current_mv);
-++        }
-+ 
-+         if (s->ps.sps->chroma_format_idc) {
-+ #ifdef RPI_INTER_QPU
-+@@ -2821,7 +2941,6 @@ static void rpi_inter_clear(HEVCContext *s)
-+         *s->u_mvs[i]++ = pic_height;
-+         *s->u_mvs[i]++ = s->frame->linesize[1];
-+         *s->u_mvs[i]++ = s->frame->linesize[2];
-+-        *s->u_mvs[i]++ = i;
-+         if (weight_flag) {
-+             *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+             *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-+@@ -2829,7 +2948,31 @@ static void rpi_inter_clear(HEVCContext *s)
-+             *s->u_mvs[i]++ = 1 << 5;
-+             *s->u_mvs[i]++ = 6;
-+         }
-++        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-++    }
-++
-++#ifdef RPI_LUMA_QPU
-++    for(i=0;i<12;i++) {
-++        s->y_mvs[i] = s->y_mvs_base[i];
-++        *s->y_mvs[i]++ = 0; // y_x
-++        *s->y_mvs[i]++ = 0; // ref_y_base
-++        *s->y_mvs[i]++ = 0; // y2_x2
-++        *s->y_mvs[i]++ = 0; // ref_y2_base
-++        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-++        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
-++        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-++        if (weight_flag) {
-++            int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-++            int shift = s->sh.luma_log2_weight_denom + 6;
-++            *s->y_mvs[i]++ = (offset << 16) + shift;
-++        } else {
-++            int offset = 1 << 5;
-++            int shift = 6;
-++            *s->y_mvs[i]++ = (offset << 16) + shift;
-++        }
-++        *s->y_mvs[i]++ = 0; // Next kernel
-+     }
-++#endif
-+ }
-+ 
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+@@ -2837,6 +2980,9 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+     int k;
-+     int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++#ifdef RPI_LUMA_QPU
-++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
-++#endif
-+     if (s->sh.slice_type == I_SLICE) {
-+ #ifdef RPI_MULTI_MAILBOX
-+       rpi_execute_transform(s);
-+@@ -2852,8 +2998,23 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-++#ifdef RPI_LUMA_QPU
-++    for(k=0;k<12;k++) {
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-++        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++    }
-++    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++#endif
-++
-++
-+ #ifdef RPI_MULTI_MAILBOX
-++#ifdef RPI_CACHE_UNIF_MVS
-++    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-++#else
-+     gpu_cache_flush(&s->coeffs_buf_accelerated);
-++#endif
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+@@ -2863,7 +3024,27 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+                                    (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++#ifdef RPI_LUMA_QPU
-++                                   qpu_get_fn(QPU_MC_SETUP),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
-++#else
-++                                   0,
-++                                   0,0,0,0,
-++                                   0,0,0,0,
-++                                   0,0,0,0
-++#endif
-+                                  );
-+     for(i=0;i<4;i++)
-+         s->num_coeffs[i] = 0;
-+@@ -2879,6 +3060,8 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+       (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+       );
-+ #endif
-++
-++
-+ }
-+ #endif
-+ 
-+@@ -3502,8 +3685,7 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
-+ fail:
-+     if (s->ref && s->threads_type == FF_THREAD_FRAME) {
-+ #ifdef RPI_INTER_QPU
-+-        void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+-        ff_hevc_flush_chroma(s, &s->ref->tf, s->ps.sps->height);
-++        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-+     }
-+@@ -3690,7 +3872,6 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ 
-+ #ifdef RPI
-+     av_freep(&s->unif_mv_cmds);
-+-    av_freep(&s->unif_xfm_cmds);
-+     av_freep(&s->univ_pred_cmds);
-+ 
-+ #ifdef RPI_INTER_QPU
-+@@ -3699,7 +3880,12 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+         s->unif_mvs = 0;
-+     }
-+ #endif
-+-    //gpu_free(&s->dummy);
-++#ifdef RPI_LUMA_QPU
-++    if (s->y_unif_mvs) {
-++        gpu_free( &s->y_unif_mvs_ptr );
-++        s->y_unif_mvs = 0;
-++    }
-++#endif
-+ 
-+ #ifdef EARLY_MALLOC
-+     printf("hevc_decode_free\n");
-+@@ -3789,9 +3975,6 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+     if (!s->unif_mv_cmds)
-+         goto fail;
-+-    s->unif_xfm_cmds = av_mallocz(sizeof(HEVCXfmCmd)*RPI_MAX_XFM_CMDS);
-+-    if (!s->unif_xfm_cmds)
-+-        goto fail;
-+     s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+     if (!s->univ_pred_cmds)
-+         goto fail;
-+@@ -3805,7 +3988,11 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     {
-+         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-++#ifdef RPI_CACHE_UNIF_MVS
-++        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++#else
-+         gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++#endif
-+         s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-+ 
-+         // Set up initial locations for uniform streams
-+@@ -3820,6 +4007,28 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ 
-+     }
-+ #endif
-++#ifdef RPI_LUMA_QPU
-++    {
-++        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-++        uint32_t *p;
-++#ifdef RPI_CACHE_UNIF_MVS
-++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++#else
-++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++#endif
-++        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++
-++        // Set up initial locations for uniform streams
-++        p = s->y_unif_mvs;
-++        for(i = 0; i < 12; i++) {
-++            s->y_mvs_base[i] = p;
-++            p += y_commands_per_qpu;
-++        }
-++        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-++        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-++
-++    }
-++#endif
-+     //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+ #ifdef EARLY_MALLOC
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 3511982..33dedf7 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -43,9 +43,13 @@
-+ #ifdef RPI
-+ 
-+   #include "rpi_qpu.h"
-+-  // Use QPU for inter prediction
-++  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
-+   #define RPI_INTER_QPU
-+ 
-++  #ifdef RPI_INTER_QPU
-++    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-++    #define RPI_LUMA_QPU
-++  #endif
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -835,7 +839,6 @@ typedef struct HEVCLocalContext {
-+ 
-+ // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+ #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+-#define RPI_MAX_XFM_CMDS  (16*3*(RPI_MAX_WIDTH/4))
-+ // Each block can have an intra prediction and a transform_add command
-+ #define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
-+ // Worst case is 16x16 CTUs
-+@@ -870,9 +873,6 @@ typedef struct HEVCMvCmd {
-+     int8_t ref_idx[2];
-+ } HEVCMvCmd;
-+ 
-+-// Command for transform to process a block of coefficients
-+-typedef struct HEVCXfmCmd {
-+-} HEVCXfmCmd;
-+ 
-+ // Command for intra prediction and transform_add of predictions to coefficients
-+ #define RPI_PRED_TRANSFORM_ADD 0
-+@@ -918,8 +918,7 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;  // TODO rename
-+-    HEVCXfmCmd *unif_xfm_cmds;
-++    HEVCMvCmd *unif_mv_cmds;
-+     HEVCPredCmd *univ_pred_cmds;
-+     int buf_width;
-+     GPU_MEM_PTR_T coeffs_buf_default;
-+@@ -946,6 +945,15 @@ typedef struct HEVCContext {
-+     uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-++#ifdef RPI_LUMA_QPU
-++    GPU_MEM_PTR_T y_unif_mvs_ptr;
-++    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
-++    uint32_t *y_mvs_base[12];
-++    uint32_t *y_mvs[12];
-++    // Function pointers
-++    uint32_t mc_filter;
-++    uint32_t mc_filter_b;
-++#endif
-+ 
-+ #endif
-+ 
-+@@ -1181,6 +1189,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                                  int log2_trafo_size, enum ScanType scan_idx,
-+                                  int c_idx);
-+ 
-++#ifdef RPI_INTER_QPU
-++extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
-++#endif
-++
-+ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
-+ 
-+ 
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index ec84e8a..11629e4 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -883,8 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+   return p->vc & 0x3fffffff;
-+ }
-+ 
-+-void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n);
-+-void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-++void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+             s->nal_unit_type == NAL_TSA_N   ||
-+@@ -911,10 +910,24 @@ void ff_hevc_flush_chroma(HEVCContext *s, ThreadFrame *f, int n)
-+         iocache.s[1].cmd = 3; // clean+invalidate
-+         iocache.s[1].addr = p->arm + base;
-+         iocache.s[1].size  = sz;
-++
-++#ifdef RPI_LUMA_QPU
-++        p = av_buffer_pool_opaque(s->frame->buf[0]);
-++        sz = s->frame->linesize[0] * (n-curr_y);
-++        base = s->frame->linesize[0] * curr_y;
-++        iocache.s[2].handle = p->vcsm_handle;
-++        iocache.s[2].cmd = 3; // clean+invalidate
-++        iocache.s[2].addr = p->arm + base;
-++        iocache.s[2].size  = sz;
-++#endif
-+         vcsm_clean_invalid( &iocache );
-+ #else
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-++#ifdef RPI_LUMA_QPU
-++        flush_buffer(s->frame->buf[1]);
-++#endif
-++
-+ #endif
-+         //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
-+         //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
-+@@ -938,7 +951,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x, y - ctb_size);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s,&s->ref->tf, y);
-++                ff_hevc_flush_buffer(s,&s->ref->tf, y);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y, 0);
-+             }
-+@@ -947,7 +960,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+             sao_filter_CTB(s, x , y);
-+             if (s->threads_type & FF_THREAD_FRAME ) {
-+ #ifdef RPI_INTER_QPU
-+-                ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size);
-++                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
-+ #endif
-+                 ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
-+             }
-+@@ -957,7 +970,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+-        ff_hevc_flush_chroma(s, &s->ref->tf, y + ctb_size - 4);
-++        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index aa65a77..e12304b 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -1,9 +1,11 @@
-+ #ifdef RPI
-+ // This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
-+ // define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
-+-#define RPI_TIME_TOTAL_QPU
-++//#define RPI_TIME_TOTAL_QPU
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-++// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-++//#define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ 
-+@@ -94,7 +96,8 @@ struct GPU
-+   int open_count; // Number of allocated video buffers
-+   int      mb; // Mailbox handle
-+   int      vc; // Address in GPU memory
-+-  int mail[12]; // These are used to pass pairs of code/unifs to the QPUs
-++  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-++  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
-+ };
-+ 
-+ // Stop more than one thread trying to allocate memory or use the processing resources at once
-+@@ -102,7 +105,7 @@ static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ static volatile struct GPU* gpu = NULL;
-+ static GPU_MEM_PTR_T gpu_mem_ptr;
-+ 
-+-#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU)
-++#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
-+ static unsigned int Microseconds(void) {
-+     struct timespec ts;
-+     unsigned int x;
-+@@ -123,7 +126,7 @@ static pthread_cond_t post_cond_head = PTHREAD_COND_INITIALIZER;
-+ static pthread_cond_t post_cond_tail = PTHREAD_COND_INITIALIZER;
-+ static pthread_mutex_t post_mutex = PTHREAD_MUTEX_INITIALIZER;
-+ 
-+-static int vpu_cmds[MAXCMDS][16];
-++static int vpu_cmds[MAXCMDS][32];
-+ static volatile int vpu_async_tail=0; // Contains the number of posted jobs
-+ static volatile int vpu_async_head=0;
-+ #endif
-+@@ -247,7 +250,6 @@ int gpu_get_mailbox(void)
-+ // Call this to clean and invalidate a region of memory
-+ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ {
-+-#define RPI_FAST_CACHEFLUSH
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     iocache.s[0].handle = p->vcsm_handle;
-+@@ -261,6 +263,34 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+ #endif
-+ }
-+ 
-++void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    iocache.s[0].handle = p0->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = (int) p0->arm;
-++    iocache.s[0].size  = p0->numbytes;
-++    iocache.s[1].handle = p1->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = (int) p1->arm;
-++    iocache.s[1].size  = p1->numbytes;
-++    iocache.s[2].handle = p2->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = (int) p2->arm;
-++    iocache.s[2].size  = p2->numbytes;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    void *tmp;
-++    tmp = vcsm_lock(p0->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++    tmp = vcsm_lock(p1->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++    tmp = vcsm_lock(p2->vcsm_handle);
-++    vcsm_unlock_ptr(tmp);
-++#endif
-++}
-++
-+ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+   p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+@@ -357,9 +387,19 @@ unsigned int vpu_get_constants(void) {
-+ #ifdef RPI_ASYNC
-+ 
-+ static void *vpu_start(void *arg) {
-++#ifdef RPI_TIME_TOTAL_POSTED
-++  int last_time=0;
-++  long long on_time=0;
-++  long long off_time=0;
-++  int start_time;
-++  int end_time;
-++  int count=0;
-++#endif
-+   while(1) {
-++    int i;
-+     int *p;
-+     int qpu_code;
-++    int qpu_codeb;
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -373,24 +413,49 @@ static void *vpu_start(void *arg) {
-+       break; // Last job
-+     }
-+     qpu_code = p[7];
-++    qpu_codeb = p[16];
-+     //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+     //}
-++
-++#ifdef RPI_TIME_TOTAL_POSTED
-++    start_time = Microseconds();
-++    if (last_time==0)
-++      last_time = start_time;
-++    off_time += start_time-last_time;
-++#endif
-++
-+     if (!qpu_code) {
-+       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+     } else {
-+-      int i;
-+       for(i=0;i<8;i++) {
-+         gpu->mail[i*2] = p[8+i];
-+         gpu->mail[i*2 + 1] = qpu_code;
-+       }
-+-
-+-      execute_multi(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+-                              0, 0, 0, 0,
-++      for(i=0;i<12;i++) {
-++        gpu->mail2[i*2] = p[17+i];
-++        gpu->mail2[i*2 + 1] = qpu_codeb;
-++      }
-++#if (0)
-++      vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-++      execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
-++#else
-++      execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++#endif
-+     }
-++#ifdef RPI_TIME_TOTAL_POSTED
-++    end_time = Microseconds();
-++    last_time = end_time;
-++    on_time += end_time - start_time;
-++    count++;
-++    if ((count&0x7f)==0)
-++      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+@@ -436,7 +501,9 @@ int vpu_post_code(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned
-+ }
-+ 
-+ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+-                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8)
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-++                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-++                      )
-+ {
-+ 
-+   pthread_mutex_lock(&post_mutex);
-+@@ -464,6 +531,21 @@ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2,
-+     p[13] = unifs6;
-+     p[14] = unifs7;
-+     p[15] = unifs8;
-++
-++    p[16] = qpu_codeb;
-++    p[17] = unifs1b;
-++    p[18] = unifs2b;
-++    p[19] = unifs3b;
-++    p[20] = unifs4b;
-++    p[21] = unifs5b;
-++    p[22] = unifs6b;
-++    p[23] = unifs7b;
-++    p[24] = unifs8b;
-++    p[25] = unifs9b;
-++    p[26] = unifs10b;
-++    p[27] = unifs11b;
-++    p[28] = unifs12b;
-++
-+     if (num<=1)
-+       pthread_cond_broadcast(&post_cond_tail); // Otherwise the vpu thread must already be awake
-+     pthread_mutex_unlock(&post_mutex);
-+@@ -544,27 +626,27 @@ void qpu_run_shader12(int code, int num, int code2, int num2, int unifs1, int un
-+   off_time += start_time-last_time;
-+ #endif
-+   for(i=0;i<num;i++) {
-+-    gpu->mail[i*2 + 1] = code;
-++    gpu->mail2[i*2 + 1] = code;
-+   }
-+   for(;i<num+num2;i++) {
-+-    gpu->mail[i*2 + 1] = code2;
-++    gpu->mail2[i*2 + 1] = code2;
-+   }
-+-  gpu->mail[0 ] = unifs1;
-+-  gpu->mail[2 ] = unifs2;
-+-  gpu->mail[4 ] = unifs3;
-+-  gpu->mail[6 ] = unifs4;
-+-  gpu->mail[8 ] = unifs5;
-+-  gpu->mail[10] = unifs6;
-+-	gpu->mail[12] = unifs7;
-+-	gpu->mail[14] = unifs8;
-+-	gpu->mail[16] = unifs9;
-+-	gpu->mail[18] = unifs10;
-+-	gpu->mail[20] = unifs11;
-+-	gpu->mail[22] = unifs12;
-++  gpu->mail2[0 ] = unifs1;
-++  gpu->mail2[2 ] = unifs2;
-++  gpu->mail2[4 ] = unifs3;
-++  gpu->mail2[6 ] = unifs4;
-++  gpu->mail2[8 ] = unifs5;
-++  gpu->mail2[10] = unifs6;
-++	gpu->mail2[12] = unifs7;
-++	gpu->mail2[14] = unifs8;
-++	gpu->mail2[16] = unifs9;
-++	gpu->mail2[18] = unifs10;
-++	gpu->mail2[20] = unifs11;
-++	gpu->mail2[22] = unifs12;
-+ 	execute_qpu(
-+ 		gpu->mb,
-+ 		12 /* Number of QPUs */,
-+-		gpu->vc + offsetof(struct GPU, mail),
-++		gpu->vc + offsetof(struct GPU, mail2),
-+ 		1 /* no flush */,  // Don't flush VPU L1 cache
-+ 		5000 /* timeout ms */);
-+ #ifdef RPI_TIME_TOTAL_QPU
-+@@ -635,21 +717,21 @@ unsigned int qpu_get_fn(int num) {
-+       gpu_unlock();
-+     }
-+     switch(num) {
-+-    //case QPU_MC_SETUP:
-+-    //  fn = mc_setup;
-+-    //  break;
-+-    //case QPU_MC_FILTER:
-+-    //  fn = mc_filter;
-+-    //  break;
-++    case QPU_MC_SETUP:
-++      fn = mc_setup;
-++      break;
-++    case QPU_MC_FILTER:
-++      fn = mc_filter;
-++      break;
-+     case QPU_MC_EXIT:
-+       fn = mc_exit;
-+       break;
-+-    //case QPU_MC_INTERRUPT_EXIT:
-+-    //  fn = mc_interrupt_exit;
-+-    //  break;
-+-    //case QPU_MC_FILTER_B:
-+-    //  fn = mc_filter_b;
-+-    //  break;
-++    case QPU_MC_INTERRUPT_EXIT12:
-++      fn = mc_interrupt_exit12;
-++      break;
-++    case QPU_MC_FILTER_B:
-++      fn = mc_filter_b;
-++      break;
-+     //case QPU_MC_FILTER_HONLY:
-+     //  fn = mc_filter_honly;
-+     //  break;
-+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-+index 0565a60..81c2bb1 100644
-+--- a/libavcodec/rpi_qpu.h
-++++ b/libavcodec/rpi_qpu.h
-+@@ -1,6 +1,7 @@
-+ #ifndef RPI_QPU_H
-+ #define RPI_QPU_H
-+ 
-++// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
-+ #define RPI_FAST_CACHEFLUSH
-+ 
-+ typedef struct gpu_mem_ptr_s {
-+@@ -16,6 +17,7 @@ extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+ extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+ extern void gpu_free(GPU_MEM_PTR_T *p);
-+ extern void gpu_cache_flush(GPU_MEM_PTR_T *p);
-++extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+ // QPU specific functions
-+ extern void qpu_run_shader8(int code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-+@@ -26,7 +28,7 @@ enum {
-+   QPU_MC_SETUP,
-+   QPU_MC_FILTER,
-+   QPU_MC_EXIT,
-+-  QPU_MC_INTERRUPT_EXIT,
-++  QPU_MC_INTERRUPT_EXIT12,
-+   QPU_MC_FILTER_B,
-+   QPU_MC_FILTER_HONLY,
-+   QPU_MC_SETUP_UV,
-+@@ -44,7 +46,9 @@ extern unsigned int vpu_get_constants(void);
-+ extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+ extern int vpu_post_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+ int vpu_qpu_post_code(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+-                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8);
-++                      int qpu_code, int unifs1, int unifs2, int unifs3, int unifs4, int unifs5, int unifs6, int unifs7, int unifs8,
-++                      int qpu_codeb, int unifs1b, int unifs2b, int unifs3b, int unifs4b, int unifs5b, int unifs6b, int unifs7b, int unifs8b, int unifs9b, int unifs10b, int unifs11b, int unifs12b
-++                      );
-+ extern void vpu_wait( int id);
-+ 
-+ // Simple test of shader code
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index a0f0282..e86eb30 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -48,693 +48,674 @@ unsigned int rpi_shader[] = {
-+ /* [0x000000b8] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+ /* [0x000000c0] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+ /* [0x000000c8] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x000000d0] */ 0x15827d80, 0x100208e7, // mov r3, unif
-+-/* [0x000000d8] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+-/* [0x000000e0] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x000000e8] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x000000f0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000100] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000108] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000110] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000118] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000120] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000128] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000130] */ 0x119c17c0, 0xd00208a7, // shl r2, r3, 1
-+-/* [0x00000138] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+-/* [0x00000140] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000148] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000150] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000158] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000160] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000168] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000170] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000178] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000180] */ 0x0f9c11c0, 0xd0020827, // asr r0, r0, 1
-+-/* [0x00000188] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+-/* [0x00000190] */ 0x0c9e7040, 0x10021567, // add rb21, r0, r1
-+-/* [0x00000198] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+-/* [0x000001a0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+-/* [0x000001a8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+-/* [0x000001b0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+-/* [0x000001b8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x000001c0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000001c8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000001d0] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x000000d0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-++/* [0x000000d8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-++/* [0x000000e0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-++/* [0x000000e8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-++/* [0x000000f0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x000000f8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000100] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000108] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-++/* [0x00000110] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000118] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-++/* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-++/* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-++/* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
-++/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-++/* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
-++/* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
-++/* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-++/* [0x00000158] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-++/* [0x00000160] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000168] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000170] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000178] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000180] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000188] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000190] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000198] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x000001a0] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-++/* [0x000001a8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-++/* [0x000001b0] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-++/* [0x000001b8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x000001c0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x000001c8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x000001d0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+ /* [0x000001d8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x000001e0] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+-/* [0x000001e8] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+-/* [0x000001f0] */ 0x0c9e7440, 0x10020e27, // add t0s, r2, r1
-+-/* [0x000001f8] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000200] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000208] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000210] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000218] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000220] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000228] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000230] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+-/* [0x00000238] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x000001e0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x000001e8] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000001f0] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x000001f8] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-++/* [0x00000200] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
-+ // ::mc_filter_uv
-+-/* [0x00000240] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000248] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000250] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000258] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000260] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000268] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000270] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000278] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000280] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000288] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000290] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000298] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000002a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000002a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000002b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000002b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000002c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000002c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000002d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000002d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000002e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000002e8] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000002f0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000002f8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000300] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000308] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000310] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000318] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000320] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000328] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000330] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000338] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000340] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000348] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000350] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000358] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000360] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000368] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000370] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-+-/* [0x00000378] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000380] */ 0x0f9e7080, 0x100613a7, // asr.ifnz rb14, r0, r2
-+-/* [0x00000388] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000390] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000208] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000210] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000230] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000238] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000240] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000248] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000250] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000260] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000268] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000270] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000278] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000280] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000288] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000290] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000298] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000002a0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000002a8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x000002b0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x000002b8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x000002c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000002c8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002d0] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002d8] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000002e0] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000002e8] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000002f0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000002f8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000300] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000308] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000310] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000318] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-++/* [0x00000320] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000328] */ 0x0f9e7080, 0x100208e7, // asr r3, r0, r2
-++/* [0x00000330] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000338] */ 0x0f9e7080, 0x100613e7, // asr.ifnz rb15, r0, r2
-++/* [0x00000340] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000348] */ 0x0f9e7080, 0x100608e7, // asr.ifnz r3, r0, r2
-++/* [0x00000350] */ 0x119c87c0, 0xd00213a7, // shl rb14,r3,8
-++/* [0x00000358] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop
-+-/* [0x00000398] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000003a0] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000003a8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000003b0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x000003b8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x000003c0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x000003c8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000003d0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000003d8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000003e0] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000003e8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000003f0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000003f8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000400] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000408] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000410] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000418] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000420] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000428] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000430] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000438] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000440] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x00000448] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000450] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000458] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000460] */ 0x00000020, 0xe0021327, // mov rb12,32
-+-/* [0x00000468] */ 0x00000006, 0xe0021367, // mov rb13,6
-+-/* [0x00000470] */ 0x00000001, 0xe00213a7, // mov rb14,1
-+-/* [0x00000478] */ 0x00000000, 0xe00213e7, // mov rb15,0
-+-/* [0x00000480] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000488] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000490] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000498] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000004a0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000004a8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000004b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000004b8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000004c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000004c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x000004d0] */ 0xfffffea8, 0xf06809e7, // brr.anyn -, r:uvloop
-+-/* [0x000004d8] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x000004e0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x000004e8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x000004f0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x000004f8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000500] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000508] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000510] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000518] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000520] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000528] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000530] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000360] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000368] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000370] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000378] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000380] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000388] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000390] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000398] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000003a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000003a8] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x000003b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000003b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000003c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000003c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000003d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000003d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000003e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x000003e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x000003f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000003f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000400] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000408] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000410] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000418] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000420] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000428] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000430] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000438] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000440] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000448] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000450] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000458] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000460] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00000468] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00000470] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00000478] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop
-++/* [0x00000480] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00000488] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00000490] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00000498] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000004a0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004a8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000004b0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000004b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000004c0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000004c8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000004d0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000004d8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_uv_b0
-+-/* [0x00000538] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000540] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00000548] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000550] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000558] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x00000560] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000568] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x00000570] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x00000578] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x00000580] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000588] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x00000590] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-+-/* [0x00000598] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000005a0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005a8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000005b0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000005b8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x000005c0] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x000005c8] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x000005d0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x000005d8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x000005e0] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x000005e8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x000005f0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000005f8] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000600] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000608] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000610] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000618] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000620] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000628] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000630] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x00000638] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000640] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000648] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x00000650] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000658] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x000004e0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x000004e8] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x000004f0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x000004f8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000500] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000508] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000510] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000518] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000520] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000528] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000530] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000538] */ 0x159d5fc0, 0x10021c67, // mov vw_setup, rb21
-++/* [0x00000540] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x00000548] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000550] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x00000558] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000560] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000568] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x00000570] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x00000578] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000580] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000588] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000590] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000598] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005a0] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005a8] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b0] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005b8] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x000005c0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005c8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x000005d0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x000005d8] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x000005e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005e8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000005f0] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x000005f8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000600] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b0
-+-/* [0x00000660] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x00000668] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x00000670] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000678] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000680] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000688] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000690] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000698] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x000006a0] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x000006a8] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x000006b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000006b8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000006c0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000006c8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000006d0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000006d8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000006e0] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000006e8] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000006f0] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x000006f8] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000700] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000708] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000710] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000718] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00000720] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00000728] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00000730] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x00000738] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x00000740] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x00000748] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x00000750] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+-/* [0x00000758] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+-/* [0x00000760] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+-/* [0x00000768] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000770] */ 0x009e7000, 0x100009e7, // nop
-+-/* [0x00000778] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000780] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000788] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000790] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000608] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000610] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000618] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00000620] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00000628] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x00000630] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00000638] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00000640] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x00000648] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x00000650] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x00000658] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000660] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x00000668] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x00000670] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x00000678] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000680] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000688] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000690] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000698] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x000006a0] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x000006a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x000006b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x000006b8] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x000006c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000006c8] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000006d0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000006d8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000006e0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000006e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000006f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000006f8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-++/* [0x00000700] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-++/* [0x00000708] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-++/* [0x00000710] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000718] */ 0x009e7000, 0x100009e7, // nop
-++/* [0x00000720] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000728] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000730] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000738] */ 0x009e7000, 0x100009e7, // nop
-+ // ::mc_filter_uv_b
-+-/* [0x00000798] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x000007a0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x000007a8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000007b0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x000007b8] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+-/* [0x000007c0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x000007c8] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-+-/* [0x000007d0] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+-/* [0x000007d8] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-+-/* [0x000007e0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x000007e8] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+-/* [0x000007f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000007f8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000800] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000808] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000810] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000818] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000820] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-+-/* [0x00000828] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-+-/* [0x00000830] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000838] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-+-/* [0x00000840] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-+-/* [0x00000848] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+-/* [0x00000850] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000858] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000860] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000868] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+-/* [0x00000870] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000878] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000880] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000888] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000890] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000898] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a0] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000008a8] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000008b0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-+-/* [0x000008b8] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008c0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000008c8] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-+-/* [0x000008d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000008d8] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000740] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-++/* [0x00000748] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-++/* [0x00000750] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-++/* [0x00000758] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-++/* [0x00000760] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-++/* [0x00000768] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000770] */ 0x0d827cc0, 0x100208a7, // sub r2, unif, r3
-++/* [0x00000778] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-++/* [0x00000780] */ 0x149dc1c0, 0xd00214e7, // and rb_x_next, r0, ~3
-++/* [0x00000788] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000790] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-++/* [0x00000798] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x000007a0] */ 0x00000010, 0xe00208a7, // mov r2, 16
-++/* [0x000007a8] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x000007b0] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-++/* [0x000007b8] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x000007c0] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x000007c8] */ 0x0c9c11c0, 0xd0021467, // add rb17, r0, 1
-++/* [0x000007d0] */ 0x0c9c31c0, 0xd00214a7, // add rb18, r0, 3
-++/* [0x000007d8] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x000007e0] */ 0x119cd1c0, 0xd00208e7, // shl r3, r0, 13
-++/* [0x000007e8] */ 0x119c87c0, 0xd00208e7, // shl r3, r3, 8
-++/* [0x000007f0] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-++/* [0x000007f8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000800] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-++/* [0x00000808] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000810] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-++/* [0x00000818] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000820] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000828] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000830] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000838] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-++/* [0x00000840] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000848] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-++/* [0x00000850] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000858] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000860] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000868] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000870] */ 0xfffffff8, 0xe0021967, // mov r5rep, -8
-++/* [0x00000878] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x00000880] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :uvloop_b
-+-/* [0x000008e0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-+-/* [0x000008e8] */ 0x8e4539bf, 0xa0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-+-/* [0x000008f0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x000008f8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000900] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-+-/* [0x00000908] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000910] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000918] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+-/* [0x00000920] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-/* [0x00000928] */ 0x0c627c80, 0x10020e27, // add t0s, ra_frame_base, r2
-+-/* [0x00000930] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000938] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000940] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000948] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000950] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000958] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000960] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000968] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000970] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00000978] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00000980] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-+-/* [0x00000988] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x00000990] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00000998] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x000009a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x000009a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x000009b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000009b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000009c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000009c8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000009d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000009d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000009e0] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-+-/* [0x000009e8] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-+-/* [0x000009f0] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+-/* [0x000009f8] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00000a00] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00000a08] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00000a10] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00000a18] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a20] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+-/* [0x00000a28] */ 0x00000010, 0xe0020827, // mov r0, 16
-+-/* [0x00000a30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+-/* [0x00000a38] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00000a40] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+-/* [0x00000a48] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00000a50] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00000888] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0
-++/* [0x00000890] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-++/* [0x00000898] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x000008a0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x000008a8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
-++/* [0x000008b0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x000008b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000008c0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-++/* [0x000008c8] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-++/* [0x000008d0] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-++/* [0x000008d8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000008e0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000008e8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000008f0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000008f8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x00000900] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00000908] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00000910] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00000918] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00000920] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00000928] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
-++/* [0x00000930] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x00000938] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00000940] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00000948] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x00000950] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x00000958] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x00000960] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x00000968] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x00000970] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x00000978] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x00000980] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00000988] */ 0x0cc27380, 0x10020867, // add r1, r1, vpm
-++/* [0x00000990] */ 0x0c7a7380, 0x10020867, // add r1, r1, ra30
-++/* [0x00000998] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
-++/* [0x000009a0] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-++/* [0x000009a8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x000009b0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x000009b8] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x000009c0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009c8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x000009d0] */ 0x00000010, 0xe0020827, // mov r0, 16
-++/* [0x000009d8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-++/* [0x000009e0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x000009e8] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-++/* [0x000009f0] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x000009f8] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_exit
-+-/* [0x00000a58] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000a60] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+-/* [0x00000a68] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a70] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a78] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a80] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000a88] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000a90] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+-/* [0x00000a98] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a00] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a08] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-++/* [0x00000a10] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a18] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a20] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a28] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a30] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000a38] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a40] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_interrupt_exit8
-+-/* [0x00000aa0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00000aa8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ab8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00000ac8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ad8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000ae8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000af8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00000b00] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x00000b08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x00000b10] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00000a48] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00000a50] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a58] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a60] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00000a68] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00000a70] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a78] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a80] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a88] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a90] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000a98] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00000aa8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00000ab0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_setup
-+-/* [0x00000b18] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000b20] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00000b28] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b30] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b40] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000b48] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+-/* [0x00000b50] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000b58] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000b78] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+-/* [0x00000b80] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000b88] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000b98] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000ba0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-+-/* [0x00000ba8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000bb0] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+-/* [0x00000bb8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+-/* [0x00000bc0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000bc8] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-+-/* [0x00000bd0] */ 0x8c9e7452, 0x10025e19, // add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-/* [0x00000bd8] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+-/* [0x00000be0] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+-/* [0x00000be8] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+-/* [0x00000bf0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000bf8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+-/* [0x00000c00] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+-/* [0x00000c08] */ 0x00000001, 0xe0020527, // mov ra20, 1
-+-/* [0x00000c10] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-+-/* [0x00000c18] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+-/* [0x00000c20] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+-/* [0x00000c28] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-+-/* [0x00000c30] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+-/* [0x00000c38] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+-/* [0x00000c40] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+-/* [0x00000c48] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+-/* [0x00000c50] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+-/* [0x00000c58] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+-/* [0x00000c60] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+-/* [0x00000c68] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+-/* [0x00000c70] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+-/* [0x00000c78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000c80] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000c88] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000c90] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000c98] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000ca0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000ca8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000cb0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+-/* [0x00000cb8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+-/* [0x00000cc0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+-/* [0x00000cc8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+-/* [0x00000cd0] */ 0x159e7480, 0x10020867, // mov r1, r2
-+-/* [0x00000cd8] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+-/* [0x00000ce0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+-/* [0x00000ce8] */ 0x159e7480, 0x10020827, // mov r0, r2
-+-/* [0x00000cf0] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+-/* [0x00000cf8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000d00] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+-/* [0x00000d08] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+-/* [0x00000d10] */ 0x15827d80, 0x10021327, // mov rb12,unif
-+-/* [0x00000d18] */ 0x15827d80, 0x10021367, // mov rb13,unif
-+-/* [0x00000d20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000d28] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+-/* [0x00000d30] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d38] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+-/* [0x00000d40] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d48] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+-/* [0x00000d50] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-+-/* [0x00000d58] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+-/* [0x00000d60] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00000ac0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-++/* [0x00000ac8] */ 0x15827d80, 0x10020227, // mov ra8, unif
-++/* [0x00000ad0] */ 0x15827d80, 0x10020267, // mov ra9, unif
-++/* [0x00000ad8] */ 0x15827d80, 0x100202a7, // mov ra10, unif
-++/* [0x00000ae0] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-++/* [0x00000ae8] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000af0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000af8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000b00] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000b08] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-++/* [0x00000b10] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-++/* [0x00000b18] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-++/* [0x00000b20] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000b28] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-++/* [0x00000b30] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-++/* [0x00000b38] */ 0x15227d80, 0x10020867, // mov r1, ra8
-++/* [0x00000b40] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000b48] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000b50] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000b58] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000b60] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000b68] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-++/* [0x00000b70] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000b78] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-++/* [0x00000b80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000b88] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000b90] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000b98] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000ba0] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000ba8] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-++/* [0x00000bb0] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-++/* [0x00000bb8] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000bc0] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000bc8] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000bd0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000bd8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000be0] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-++/* [0x00000be8] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000bf0] */ 0x0c9c13c0, 0xd0020567, // add ra_y2, r1, 1
-++/* [0x00000bf8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000c00] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-++/* [0x00000c08] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-++/* [0x00000c10] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000c18] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
-++/* [0x00000c20] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-++/* [0x00000c28] */ 0x00000001, 0xe0020527, // mov ra20, 1
-++/* [0x00000c30] */ 0x00000100, 0xe00205a7, // mov ra22, 256
-++/* [0x00000c38] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-++/* [0x00000c40] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-++/* [0x00000c48] */ 0x000000ff, 0xe00215a7, // mov rb22, 255
-++/* [0x00000c50] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-++/* [0x00000c58] */ 0x00000000, 0xe0020227, // mov ra8, 0
-++/* [0x00000c60] */ 0x00000000, 0xe0020267, // mov ra9, 0
-++/* [0x00000c68] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-++/* [0x00000c70] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-++/* [0x00000c78] */ 0x00000000, 0xe0020327, // mov ra12, 0
-++/* [0x00000c80] */ 0x00000000, 0xe0020367, // mov ra13, 0
-++/* [0x00000c88] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-++/* [0x00000c90] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-++/* [0x00000c98] */ 0x00004000, 0xe00204a7, // mov ra18, 0x4000
-++/* [0x00000ca0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-++/* [0x00000ca8] */ 0x159e7480, 0x10020867, // mov r1, r2
-++/* [0x00000cb0] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-++/* [0x00000cb8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-++/* [0x00000cc0] */ 0x159e7480, 0x10020827, // mov r0, r2
-++/* [0x00000cc8] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000cd8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-++/* [0x00000ce0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-++/* [0x00000ce8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-++/* [0x00000cf0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-++/* [0x00000cf8] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-++/* [0x00000d00] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000d08] */ 0x919c82ff, 0xd0024822, // shl r0,r1,r3 ; mov r2,8
-++/* [0x00000d10] */ 0x0f9e70c0, 0x10021367, // asr rb13,r0,r3
-++/* [0x00000d18] */ 0x0f9e72c0, 0x10021327, // asr rb12,r1,r3
-++/* [0x00000d20] */ 0x0c9cde80, 0x10021367, // add rb13,rb13,r2
-++/* [0x00000d28] */ 0x119cce80, 0x10021327, // shl rb12, rb12, r2
-++/* [0x00000d30] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-++/* [0x00000d38] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-++/* [0x00000d40] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-++/* [0x00000d48] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-++/* [0x00000d50] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-++/* [0x00000d58] */ 0x13540dc0, 0xd0020867, // max r1, ra_y2, 0
-++/* [0x00000d60] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+ /* [0x00000d68] */ 0x0c541dc0, 0xd0020567, // add ra_y2, ra_y2, 1
-+ /* [0x00000d70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+-/* [0x00000d78] */ 0x0c667380, 0x10020e27, // add t0s, r1, ra_frame_base2
-+-// ::mc_filter
-++/* [0x00000d78] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
-++// :per_block_setup
-+ /* [0x00000d80] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ /* [0x00000d88] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+ /* [0x00000d90] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+ /* [0x00000d98] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00000da0] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000da8] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00000db0] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000db8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00000dc0] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00000dc8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000dd0] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00000dd8] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00000de0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x00000de8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00000df0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x00000df8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x00000e00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00000e08] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x00000e10] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x00000e18] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x00000e20] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e28] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x00000e30] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x00000e38] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00000e40] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00000e48] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00000e50] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00000e58] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00000e60] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000e68] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00000e70] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000e78] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000e80] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e88] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e90] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000e98] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00000ea0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ea8] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000eb0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000eb8] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ec0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x00000ec8] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ed0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ed8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x00000ee0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000ee8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x00000ef0] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000ef8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f00] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00000f08] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000da0] */ 0x00000010, 0xe00208e7, // mov r3, 16
-++/* [0x00000da8] */ 0x15827d80, 0x10020867, // mov r1, unif
-++/* [0x00000db0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000db8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000dc0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000dc8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000dd0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000dd8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000de0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-++/* [0x00000de8] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-++/* [0x00000df0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000df8] */ 0x8c827436, 0x100246a1, // add ra_frame_base_next, r2, r0 ; mov r1, unif
-++/* [0x00000e00] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-++/* [0x00000e08] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-++/* [0x00000e10] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-++/* [0x00000e18] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-++/* [0x00000e20] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-++/* [0x00000e28] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-++/* [0x00000e30] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-++/* [0x00000e38] */ 0x159e7240, 0x10021067, // mov ra_y2_next, r1
-++/* [0x00000e40] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-++/* [0x00000e48] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-++/* [0x00000e50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-++/* [0x00000e58] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00000e60] */ 0x0e9e70c0, 0x10020867, // shr r1, r0, r3
-++/* [0x00000e68] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-++/* [0x00000e70] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-++/* [0x00000e78] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-++/* [0x00000e80] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-++/* [0x00000e88] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-++/* [0x00000e90] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-++/* [0x00000e98] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-++/* [0x00000ea0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-++/* [0x00000ea8] */ 0x95801dbf, 0xd0024821, // mov r0, unif ; mov r1,1
-++/* [0x00000eb0] */ 0x4f5971c6, 0x10024260, // asr ra9, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000eb8] */ 0x4f5971c6, 0x10024220, // asr ra8, r0, rb23;      mul24 r0, r0, ra22
-++/* [0x00000ec0] */ 0x4f5971c6, 0x10044260, // asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22
-++/* [0x00000ec8] */ 0x0f9d71c0, 0x10040227, // asr.ifz ra8, r0, rb23
-++/* [0x00000ed0] */ 0x0d243f80, 0xd0020267, // sub ra9,3,ra9
-++/* [0x00000ed8] */ 0x0d203f80, 0xd0020227, // sub ra8,3,ra8
-++/* [0x00000ee0] */ 0x11243dc0, 0xd0020267, // shl ra9,ra9,3
-++/* [0x00000ee8] */ 0x11203dc0, 0xd0020227, // shl ra8,ra8,3
-++/* [0x00000ef0] */ 0x00ffff00, 0xe0020867, // mov r1,0xffff00
-++/* [0x00000ef8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f00] */ 0x0f9d71c0, 0x10020027, // asr ra0, r0, rb23
-++/* [0x00000f08] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-+ /* [0x00000f10] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00000f18] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00000f20] */ 0x15827d80, 0x100009e7, // mov.ifnz -, unif
-+-/* [0x00000f28] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00000f30] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00000f38] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00000f40] */ 0x00000000, 0xe00208e7, // mov r3, 0
-++/* [0x00000f18] */ 0x01040400, 0xe0020867, // mov r1,0x1040400
-++/* [0x00000f20] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f28] */ 0x0f9d71c0, 0x10020067, // asr ra1, r0, rb23
-++/* [0x00000f30] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f38] */ 0x0f9d71c0, 0x10021167, // asr rb5, r0, rb23
-++/* [0x00000f40] */ 0xfbf5f600, 0xe0020867, // mov r1,0xfbf5f600
-++/* [0x00000f48] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f50] */ 0x0f9d71c0, 0x100200a7, // asr ra2, r0, rb23
-++/* [0x00000f58] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f60] */ 0x0f9d71c0, 0x100211a7, // asr rb6, r0, rb23
-++/* [0x00000f68] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-++/* [0x00000f70] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000f78] */ 0x0f9d71c0, 0x100200e7, // asr ra3, r0, rb23
-++/* [0x00000f80] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000f88] */ 0x0f9d71c0, 0x100211e7, // asr rb7, r0, rb23
-++/* [0x00000f90] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-++/* [0x00000f98] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000fa0] */ 0x0f9d71c0, 0x10020127, // asr ra4, r0, rb23
-++/* [0x00000fa8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000fb0] */ 0x0f9d71c0, 0x10021227, // asr rb8, r0, rb23
-++/* [0x00000fb8] */ 0xf6f5fb00, 0xe0020867, // mov r1,0xf6f5fb00
-++/* [0x00000fc0] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000fc8] */ 0x0f9d71c0, 0x10020167, // asr ra5, r0, rb23
-++/* [0x00000fd0] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00000fd8] */ 0x0f9d71c0, 0x10021267, // asr rb9, r0, rb23
-++/* [0x00000fe0] */ 0x04040100, 0xe0020867, // mov r1,0x4040100
-++/* [0x00000fe8] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00000ff0] */ 0x0f9d71c0, 0x100201a7, // asr ra6, r0, rb23
-++/* [0x00000ff8] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00001000] */ 0x0f9d71c0, 0x100212a7, // asr rb10, r0, rb23
-++/* [0x00001008] */ 0xffff0000, 0xe0020867, // mov r1,0xffff0000
-++/* [0x00001010] */ 0x11227380, 0x10020827, // shl r0, r1, ra8
-++/* [0x00001018] */ 0x0f9d71c0, 0x100201e7, // asr ra7, r0, rb23
-++/* [0x00001020] */ 0x11267380, 0x10020827, // shl r0, r1, ra9
-++/* [0x00001028] */ 0x0f9d71c0, 0x100212e7, // asr rb11, r0, rb23
-++/* [0x00001030] */ 0x15827d80, 0x10020827, // mov r0, unif
-++/* [0x00001038] */ 0x0f9e70c0, 0x100213e7, // asr rb15, r0, r3
-++/* [0x00001040] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-++/* [0x00001048] */ 0x119e70c0, 0x10020827, // shl r0, r0, r3
-++/* [0x00001050] */ 0x8f9c00ff, 0xd0024823, // asr r0, r0, r3 ; mov r3, 0
-++/* [0x00001058] */ 0x119c81c0, 0xd00213a7, // shl rb14, r0, 8
-++// ::mc_filter
-+ // :yloop
-+-/* [0x00000f48] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00000f50] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00000f58] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00000f60] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00000f68] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00000f70] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00000f78] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00000f80] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000f88] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00000f90] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000f98] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x00000fa0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00000fa8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x00000fb0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x00000fb8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00000fc0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x00000fc8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x00000fd0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x00000fd8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x00000fe0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x00000fe8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x00000ff0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x00000ff8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001000] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001008] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001010] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001018] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001020] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001028] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001030] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001038] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001040] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001048] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001050] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001058] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001060] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001068] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001070] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001078] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001080] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001088] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00001090] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00001098] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000010a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000010a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000010b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000010b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000010c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000010c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000010d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000010d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000010e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000010e8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-+-/* [0x000010f0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+-/* [0x000010f8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+-/* [0x00001100] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-+-/* [0x00001108] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-+-/* [0x00001110] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001118] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001120] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001128] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001130] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001138] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001060] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001068] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++/* [0x00001070] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001078] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001080] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001088] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001090] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001098] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000010a0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000010a8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000010b0] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000010b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000010c0] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000010c8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-++/* [0x000010d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000010d8] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000010e0] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000010e8] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000010f0] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000010f8] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x00001100] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001108] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001110] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001118] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001120] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001128] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001130] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001138] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001140] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001148] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001150] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001158] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001160] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-++/* [0x00001168] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001170] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001178] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001180] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001188] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001190] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001198] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x000011a0] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000011a8] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000011b0] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000011b8] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000011c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000011c8] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000011d0] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000011d8] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000011e0] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000011e8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000011f0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000011f8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-++/* [0x00001200] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
-++/* [0x00001208] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-++/* [0x00001210] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-++/* [0x00001218] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloop
-++/* [0x00001220] */ 0x0c9cf3c0, 0x10020867, // add r1, r1, rb15
-++/* [0x00001228] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001230] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001238] */ 0xfffffb28, 0xf0f809e7, // brr -, r:per_block_setup
-++/* [0x00001240] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001248] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001250] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_filter_b
-+-/* [0x00001140] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x00001148] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+-/* [0x00001150] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+-/* [0x00001158] */ 0x155e7d80, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+-/* [0x00001160] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x00001168] */ 0x938001f6, 0xd0024821, // max r0, r0, 0; mov r1, unif
-+-/* [0x00001170] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x00001178] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+-/* [0x00001180] */ 0x159e7240, 0x10020727, // mov ra_y_next, r1
-+-/* [0x00001188] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x00001190] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+-/* [0x00001198] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+-/* [0x000011a0] */ 0x938001f6, 0xd0024821, // max r0, r0, 0   ; mov r1, unif
-+-/* [0x000011a8] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+-/* [0x000011b0] */ 0x119c31c0, 0xd00205e7, // shl rx_xshift2_next, r0, 3
-+-/* [0x000011b8] */ 0x0c9c13c0, 0xd0021067, // add ra_y2_next, r1, 1
-+-/* [0x000011c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+-/* [0x000011c8] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+-/* [0x000011d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+-/* [0x000011d8] */ 0x00000010, 0xe00208a7, // mov r2, 16
-+-/* [0x000011e0] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x000011e8] */ 0x0e9e7080, 0x10020867, // shr r1, r0, r2
-+-/* [0x000011f0] */ 0x0d9d8e40, 0x10021767, // sub rb29, rb24, r1
-+-/* [0x000011f8] */ 0x149d61c0, 0x10020827, // and r0, r0, rb22
-+-/* [0x00001200] */ 0x0c9c51c0, 0xd0021467, // add rb17, r0, 5
-+-/* [0x00001208] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
-+-/* [0x00001210] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7
-+-/* [0x00001218] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+-/* [0x00001220] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001228] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+-/* [0x00001230] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001238] */ 0x00000001, 0xe0020867, // mov r1, 1
-+-/* [0x00001240] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001248] */ 0x409f3001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-/* [0x00001250] */ 0x4f5971c6, 0x100240e0, // asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001258] */ 0x409f2001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-/* [0x00001260] */ 0x4f5971c6, 0x100240a0, // asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001268] */ 0x409f1001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 15, r1 << 15
-+-/* [0x00001270] */ 0x4f5971c6, 0x10024060, // asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001278] */ 0x8f8171f6, 0x10024020, // asr ra0, r0, rb23;      mov r0, unif
-+-/* [0x00001280] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001288] */ 0x409f7001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-/* [0x00001290] */ 0x4f5971c6, 0x100241e0, // asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001298] */ 0x409f6001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-/* [0x000012a0] */ 0x4f5971c6, 0x100241a0, // asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012a8] */ 0x409f5001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-/* [0x000012b0] */ 0x4f5971c6, 0x10024160, // asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012b8] */ 0x409f4001, 0xd00049e0, // nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-/* [0x000012c0] */ 0x8f8171f6, 0x10024120, // asr ra4, r0, rb23;      mov r0, unif
-+-/* [0x000012c8] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012d0] */ 0x4f5971c6, 0x100252e0, // asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012d8] */ 0x4f5971c6, 0x100252a0, // asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-/* [0x000012e0] */ 0x4f5971c6, 0x10025260, // asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x000012e8] */ 0x8f8171f6, 0x10025220, // asr rb8, r0, rb23;      mov r0, unif
-+-/* [0x000012f0] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x000012f8] */ 0x4f5971c6, 0x100251e0, // asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001300] */ 0x4f5971c6, 0x100251a0, // asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001308] */ 0x4f5971c6, 0x10025160, // asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-/* [0x00001310] */ 0x0f9d71c0, 0x10021127, // asr rb4, r0, rb23
-+-/* [0x00001318] */ 0x15827d80, 0x10020827, // mov r0, unif
-+-/* [0x00001320] */ 0x15827d80, 0x10060827, // mov.ifnz r0, unif
-+-/* [0x00001328] */ 0x0f9e7080, 0x100213e7, // asr rb15, r0, r2
-+-/* [0x00001330] */ 0x119e7080, 0x10020827, // shl r0, r0, r2
-+-/* [0x00001338] */ 0x0f9e7080, 0x100213a7, // asr rb14, r0, r2
-+-/* [0x00001340] */ 0x00000000, 0xe00208e7, // mov r3, 0
-+ // :yloopb
-+-/* [0x00001348] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-/* [0x00001350] */ 0x8e4539bf, 0xa0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-/* [0x00001358] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-/* [0x00001360] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-/* [0x00001368] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-+-/* [0x00001370] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-+-/* [0x00001378] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+-/* [0x00001380] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x00001388] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-/* [0x00001390] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-+-/* [0x00001398] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-+-/* [0x000013a0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+-/* [0x000013a8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-/* [0x000013b0] */ 0xec654c87, 0x10024e20, // add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-+-/* [0x000013b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-/* [0x000013c0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-+-/* [0x000013c8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-/* [0x000013d0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-/* [0x000013d8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-/* [0x000013e0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-/* [0x000013e8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-/* [0x000013f0] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-/* [0x000013f8] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-/* [0x00001400] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-/* [0x00001408] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-/* [0x00001410] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-/* [0x00001418] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-/* [0x00001420] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-/* [0x00001428] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-/* [0x00001430] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-/* [0x00001438] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-/* [0x00001440] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-+-/* [0x00001448] */ 0x8d3487f6, 0xd00279cc, // sub.setf -, r3, 8    ; mov ra12, ra13
-+-/* [0x00001450] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-+-/* [0x00001458] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-+-/* [0x00001460] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-+-/* [0x00001468] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-+-/* [0x00001470] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001478] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-+-/* [0x00001480] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+-/* [0x00001488] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-+-/* [0x00001490] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-+-/* [0x00001498] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-+-/* [0x000014a0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-/* [0x000014a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-/* [0x000014b0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-/* [0x000014b8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-/* [0x000014c0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-/* [0x000014c8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-/* [0x000014d0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-+-/* [0x000014d8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-/* [0x000014e0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+-/* [0x000014e8] */ 0x4053800e, 0xd00049e1, // nop                     ; mul24 r1, r1 << 8, ra20 << 8
-+-/* [0x000014f0] */ 0x4c78e38f, 0x10024860, // add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-/* [0x000014f8] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-+-/* [0x00001500] */ 0xfffffe28, 0xf06809e7, // brr.anyn -, r:yloopb
-+-/* [0x00001508] */ 0x0f9c73c0, 0xd0020867, // asr r1, r1, 7
-+-/* [0x00001510] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-+-/* [0x00001518] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+-/* [0x00001520] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+-/* [0x00001528] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+-/* [0x00001530] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+-/* [0x00001538] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-++/* [0x00001258] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++/* [0x00001260] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++/* [0x00001268] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++/* [0x00001270] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++/* [0x00001278] */ 0x0e9c09c0, 0x10020867, // shr r1, r4, rx_xshift2
-++/* [0x00001280] */ 0x159c1fc0, 0x10040567, // mov.ifz ra_y2, ra_y2_next
-++/* [0x00001288] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-++/* [0x00001290] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x00001298] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++/* [0x000012a0] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
-++/* [0x000012a8] */ 0x13540dc0, 0xd00208a7, // max r2, ra_y2, 0
-++/* [0x000012b0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-++/* [0x000012b8] */ 0x4c541dd3, 0xd0024562, // add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++/* [0x000012c0] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-++/* [0x000012c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++/* [0x000012d0] */ 0x40027006, 0x100049e2, // nop                  ; mul24 r2, r0, ra0
-++/* [0x000012d8] */ 0x40038031, 0xd000c9e2, // nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++/* [0x000012e0] */ 0x4007f030, 0xd00049e3, // nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++/* [0x000012e8] */ 0x40077031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++/* [0x000012f0] */ 0x4c0be4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++/* [0x000012f8] */ 0x400b6031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++/* [0x00001300] */ 0x4c0fd4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++/* [0x00001308] */ 0x400f5031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++/* [0x00001310] */ 0x4c13c4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++/* [0x00001318] */ 0x40134031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++/* [0x00001320] */ 0x4c17b4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++/* [0x00001328] */ 0x40173031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++/* [0x00001330] */ 0x4c1ba4f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++/* [0x00001338] */ 0x401b2031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++/* [0x00001340] */ 0x4c1f94f0, 0xd00248a3, // add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++/* [0x00001348] */ 0x401f1031, 0xd000c9e3, // nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++/* [0x00001350] */ 0x8c9df4ff, 0x10024823, // add r0, r2, r3       ; mov r3, rb31
-++/* [0x00001358] */ 0x8d2487f6, 0xd00279c8, // sub.setf -, r3, 8    ; mov ra8, ra9
-++/* [0x00001360] */ 0x152a7d80, 0x10020267, // mov ra9, ra10
-++/* [0x00001368] */ 0x152e7d80, 0x100202a7, // mov ra10, ra11
-++/* [0x00001370] */ 0x15327d80, 0x100202e7, // mov ra11, ra12
-++/* [0x00001378] */ 0x15367d80, 0x10020327, // mov ra12, ra13
-++/* [0x00001380] */ 0xfffffeb8, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001388] */ 0x153a7d80, 0x10020367, // mov ra13, ra14
-++/* [0x00001390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-++/* [0x00001398] */ 0x159e7000, 0x100203e7, // mov ra15, r0
-++/* [0x000013a0] */ 0x4038a037, 0x100049e1, // nop                     ; mul24 r1, ra14, rb10
-++/* [0x000013a8] */ 0x40349037, 0x100049e0, // nop                     ; mul24 r0, ra13, rb9
-++/* [0x000013b0] */ 0x4c308237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra12, rb8
-++/* [0x000013b8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
-++/* [0x000013c0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8, rb4
-++/* [0x000013c8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9, rb5
-++/* [0x000013d0] */ 0x4c286237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra10, rb6
-++/* [0x000013d8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
-++/* [0x000013e0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
-++/* [0x000013e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++/* [0x000013f0] */ 0x0f9ce3c0, 0xd0020827, // asr r0, r1, 14
-++/* [0x000013f8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-++/* [0x00001400] */ 0x405b8006, 0xd00049e0, // nop                     ; mul24 r0, r0 << 8, ra22 << 8
-++/* [0x00001408] */ 0x0c4a7380, 0x10020867, // add r1, r1, ra18
-++/* [0x00001410] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0
-++/* [0x00001418] */ 0xfffffe20, 0xf06809e7, // brr.anyn -, r:yloopb
-++/* [0x00001420] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, 15
-++/* [0x00001428] */ 0x129d63c0, 0x10020867, // min r1, r1, rb22
-++/* [0x00001430] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-++/* [0x00001438] */ 0xfffff928, 0xf0f809e7, // brr -, r:per_block_setup
-++/* [0x00001440] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-++/* [0x00001448] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-++/* [0x00001450] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+ // ::mc_interrupt_exit12
-+-/* [0x00001540] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-+-/* [0x00001548] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001550] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001558] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001560] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+-/* [0x00001568] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001570] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001578] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001580] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001588] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001590] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x00001598] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+-/* [0x000015c0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-+-/* [0x000015c8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+-/* [0x000015d0] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++/* [0x00001458] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x00001460] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001468] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001470] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001478] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001480] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001488] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001490] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x00001498] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-++/* [0x000014d8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x000014e0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x000014e8] */ 0x009e7000, 0x100009e7, // nop        ; nop
-++// ::mc_exit1
-++/* [0x000014f0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
-++/* [0x000014f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001500] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001508] */ 0x009e7000, 0xa00009e7, // ldtmu0
-++/* [0x00001510] */ 0x009e7000, 0xb00009e7, // ldtmu1
-++/* [0x00001518] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
-++/* [0x00001520] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-++/* [0x00001528] */ 0x009e7000, 0x100009e7, // nop        ; nop
-+ // ::mc_end
-+ };
-+ #ifdef __HIGHC__
-+diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-+index 6e552d9..760bd17 100644
-+--- a/libavcodec/rpi_shader.h
-++++ b/libavcodec/rpi_shader.h
-+@@ -4,15 +4,16 @@
-+ extern unsigned int rpi_shader[];
-+ 
-+ #define mc_setup_uv (rpi_shader + 0)
-+-#define mc_filter_uv (rpi_shader + 144)
-+-#define mc_filter_uv_b0 (rpi_shader + 334)
-+-#define mc_filter_uv_b (rpi_shader + 486)
-+-#define mc_exit (rpi_shader + 662)
-+-#define mc_interrupt_exit8 (rpi_shader + 680)
-+-#define mc_setup (rpi_shader + 710)
-+-#define mc_filter (rpi_shader + 864)
-+-#define mc_filter_b (rpi_shader + 1104)
-+-#define mc_interrupt_exit12 (rpi_shader + 1360)
-+-#define mc_end (rpi_shader + 1398)
-++#define mc_filter_uv (rpi_shader + 130)
-++#define mc_filter_uv_b0 (rpi_shader + 312)
-++#define mc_filter_uv_b (rpi_shader + 464)
-++#define mc_exit (rpi_shader + 640)
-++#define mc_interrupt_exit8 (rpi_shader + 658)
-++#define mc_setup (rpi_shader + 688)
-++#define mc_filter (rpi_shader + 1048)
-++#define mc_filter_b (rpi_shader + 1174)
-++#define mc_interrupt_exit12 (rpi_shader + 1302)
-++#define mc_exit1 (rpi_shader + 1340)
-++#define mc_end (rpi_shader + 1356)
-+ 
-+ #endif
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index a0b8e5a..60d1ec2 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -21,6 +21,7 @@
-+ #
-+ # ra16                                          clipped(row start address+elem_num)&~3
-+ # ra17                                          per-channel shifts
-++# ra18                                          0x4000
-+ # ra19                                          next ra17
-+ #
-+ # rb16                                          pitch
-+@@ -86,7 +87,7 @@
-+ 
-+ 
-+ ################################################################################
-+-# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, pad0, pad1, pad2)
-++# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
-+ ::mc_setup_uv
-+ 
-+ # Read starting kernel
-+@@ -132,36 +133,6 @@ mov ra13, 0
-+ mov ra14, 0
-+ mov ra15, 0
-+ 
-+-# Compute part of VPM to use for DMA output
-+-mov r3, unif
-+-shl r2, r3, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+-and r2, r2, 15
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-+-
-+-# Compute part of VPM to save data into
-+-shl r2, r3, 1
-+-and r2, r2, 15    # r2 = bcd0
-+-mov r1, r2        # r1 = bcd0
-+-asr r1, r1, 2     # r1 = bc
-+-shl r1, r1, 6     # r1 = bc000000
-+-mov r0, r2        # r0 = bcd0
-+-and r0, r0, 3     # r0 = d0
-+-add r0, r0, r1    # r0 = bc0000d0
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+-add rb28, r0, r1
-+-asr r0, r0, 1     # r0 = bc0000d
-+-# Prepare VPM command for 16bit intermediates
-+-mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+-add rb21, r0, r1
-+-
-+ # Compute base address for first and second access
-+ mov r0, ra_x           # Load x
-+ max r0, r0, 0; mov r1, ra_y # Load y
-+@@ -175,10 +146,31 @@ min r1, r1, rb_frame_height_minus_1
-+ # submit texture requests for first line
-+ add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+ add t0s, r0, r1 ; mov ra_frame_base, r2
-+-add t0s, r2, r1
-++add t1s, r2, r1
-++
-++mov r2,8
-++shl rb12,unif, r2 # offset before shift
-++add rb13,unif,r2  # offset after shift
-++
-++# Compute part of VPM to use for DMA output
-++mov r2, unif
-++shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-++and r2, r2, 15
-++mov r1, r2
-++asr r1, r1, 2
-++shl r1, r1, 6
-++mov r0, r2
-++and r0, r0, 3
-++add r0, r0, r1
-+ 
-+-mov rb12,unif # offset before shift
-+-mov rb13,unif # offset after shift
-++mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++add rb28, r0, r1  # VPM 8bit storage
-++asr r2, r0, 1     # r0 = bc0000d
-++mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-++add rb21, r2, r1  # VPM for 16bit intermediates
-++mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++shl r0, r0, 5
-++add rb27, r0, r1  # DMA out
-+ 
-+ # submit texture requests for second line
-+ max r1, ra_y, 0
-+@@ -187,7 +179,7 @@ add ra_y, ra_y, 1
-+ bra -, ra31
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_x
-+-add t0s, r1, ra_frame_base
-++add t1s, r1, ra_frame_base
-+ 
-+ 
-+ 
-+@@ -248,17 +240,15 @@ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ mov r0, unif # U offset/weight
-+ asr rb15, r0, r2  # Compute offset from MSBs
-+ shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-++asr r3, r0, r2  # Compute weight from LSBs
-+ mov r0, unif # V offset/weight
-+ asr.ifnz rb15, r0, r2
-+ shl r0, r0, r2
-+-asr.ifnz rb14, r0, r2
-++asr.ifnz r3, r0, r2
-++shl rb14,r3,8 # Scale up weights so we can use mul24 in signed fashion
-+ 
-+ # r2 is elem_num
-+ # r3 is loop counter
-+-
-+-mov r5rep, -8
-+-
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+ 
-+@@ -269,7 +259,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -278,7 +268,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -301,11 +291,6 @@ mov ra13, ra14       # Delay slot 1
-+ mov ra14, ra15       # Delay slot 2
-+ mov ra15, r0         # Delay slot 3
-+ 
-+-mov rb12,32 # TODO remove these to make P weighted prediction work properly
-+-mov rb13,6
-+-mov rb14,1
-+-mov rb15,0
-+-
-+ # apply vertical filter and write to VPM
-+ 
-+ nop                     ; mul24 r1, ra14, rb10
-+@@ -412,7 +397,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -421,7 +406,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -542,7 +527,7 @@ mov r3, 0
-+ # then submit two more texture requests
-+ 
-+ sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                     ; ldtmu0     # loop counter increment
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu0
-++shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
-+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+ mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+ shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
-+@@ -551,7 +536,7 @@ max r2, ra_y, 0  # y
-+ min r2, r2, rb_frame_height_minus_1
-+ add ra_y, ra_y, 1         ; mul24 r2, r2, r3
-+ add t0s, ra_x, r2    ; v8subs r1, r1, rb20
-+-add t0s, ra_frame_base, r2
-++add t1s, ra_frame_base, r2
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+@@ -617,9 +602,9 @@ mov  -, vw_wait # wait on the VDW
-+ mov -,srel(0)
-+ 
-+ ldtmu0
-++ldtmu1
-+ ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu1
-+ 
-+ nop        ; nop ; thrend
-+ nop        ; nop # delay slot 1
-+@@ -630,9 +615,9 @@ nop        ; nop # delay slot 2
-+ mov  -, vw_wait # wait on the VDW
-+ 
-+ ldtmu0
-++ldtmu1
-+ ldtmu0
-+-ldtmu0
-+-ldtmu0
-++ldtmu1
-+ 
-+ mov -,sacq(0) # 1
-+ mov -,sacq(0) # 2
-+@@ -656,200 +641,249 @@ nop        ; nop # delay slot 2
-+ # For P frames we make the second x,y coordinates offset by +8
-+ 
-+ ################################################################################
-+-# mc_setup(next_kernel, x, y, ref_y_base, x2, y2, ref_y2_base, frame_width, frame_height, pitch, dst_pitch, offset, shift, pad2)
-++# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+ ::mc_setup
-++  mov r3, 16
-+ 
-+-# Read starting kernel
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-+-
-+-# Compute base address for first and second access
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-add ra_y, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-+-max r1, r1, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+-add t0s, r2, r1 ; mov ra_frame_base, r2
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-+-max r1, r1, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-+-add t0s, r2, r1 ; mov ra_frame_base2, r2
-+-
-++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
-++  mov ra8, unif
-++  mov ra9, unif
-++  mov ra10, unif
-++  mov ra11, unif
-+ 
-+ # Read image dimensions
-+-sub rb25,unif,1
-+-sub rb30,unif,1
-++  mov r1, unif # width_height
-++  shl r0,r1,r3
-++  asr r1,r1,r3 # width
-++  asr r0,r0,r3 # height
-++  sub rb_frame_width_minus_1,r1,1
-++  sub rb_frame_height_minus_1,r0,1
-+ 
-+ # get source pitch
-+-mov rb16, unif
-++  mov rb_pitch, unif
-+ 
-+ # get destination pitch
-+-mov r0, unif
-+-mov r1, vdw_setup_1(0)
-+-add rb24, r1, r0
-++  mov r0, unif
-++  mov r1, vdw_setup_1(0)
-++  add rb24, r1, r0
-+ 
-+-# load constants
-+-
-+-mov ra20, 1
-+-mov ra22, 256
-+-mov ra30, 64
-+-
-+-mov rb20, 0xffffff00
-+-mov rb22, 255
-+-mov rb23, 24
-++# Compute base address for first and second access
-++  mov r1, ra8 # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
-++  shl ra_xshift_next, r0, 3 # Compute shifts
-++  add ra_y, r1, 1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
-++  max r1, r1, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++  add t0s, r2, r1 ; mov ra_frame_base, r2
-++
-++  mov r1, ra10 # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
-++  shl rx_xshift2_next, r0, 3 # Compute shifts
-++  add ra_y2, r1, 1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
-++  max r1, r1, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
-++  add t1s, r2, r1 ; mov ra_frame_base2, r2
-+ 
-+-# touch vertical context to keep simulator happy
-+ 
-+-mov ra8, 0
-+-mov ra9, 0
-+-mov ra10, 0
-+-mov ra11, 0
-+-mov ra12, 0
-+-mov ra13, 0
-+-mov ra14, 0
-+-mov ra15, 0
-++# load constants
-+ 
-+-# Compute part of VPM to use for DMA output
-+-mov r2, qpu_num
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+-shl r0, r0, 5
-+-add rb27, r0, r1
-++  mov ra20, 1
-++  mov ra22, 256
-++  mov ra30, 64
-+ 
-+-# Compute part of VPM to save data into
-+-mov r2, qpu_num   # qpu_num = abcd
-+-mov r1, r2
-+-asr r1, r1, 2
-+-shl r1, r1, 6
-+-mov r0, r2
-+-and r0, r0, 3
-+-add r0, r0, r1
-+-mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+-add rb28, r0, r1
-++  mov rb20, 0xffffff00
-++  mov rb22, 255
-++  mov rb23, 24
-+ 
-+-mov rb12,unif # offset before shift
-+-mov rb13,unif # shift
-++# touch vertical context to keep simulator happy
-+ 
-+-# Dump padding words
-+-mov r0, unif
-++  mov ra8, 0
-++  mov ra9, 0
-++  mov ra10, 0
-++  mov ra11, 0
-++  mov ra12, 0
-++  mov ra13, 0
-++  mov ra14, 0
-++  mov ra15, 0
-++  mov ra18, 0x4000
-++
-++# Compute part of VPM to use
-++  mov r2, qpu_num
-++  mov r1, r2
-++  asr r1, r1, 2
-++  shl r1, r1, 6
-++  mov r0, r2
-++  and r0, r0, 3
-++  add r0, r0, r1
-++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-++  add rb28, r0, r1  # VPM for saving data
-++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-++  shl r0, r0, 5
-++  add rb27, r0, r1  # Command for dma output
-++
-++# Weighted prediction denom
-++
-++  mov r1, unif # offset_shift
-++  shl r0,r1,r3 ; mov r2,8
-++  asr rb13,r0,r3 # shift
-++  asr rb12,r1,r3 # offset
-++  add rb13,rb13,r2    # mul24 is unsigned so scale up into high bits
-++  shl rb12, rb12, r2 # Account for larger shift
-+ 
-+ # submit texture requests for second line
-+-max r1, ra_y, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1
-+-nop ; mul24 r1, r1, rb_pitch
-+-add t0s, r1, ra_frame_base
-+-
-+-max r1, ra_y2, 0
-+-min r1, r1, rb_frame_height_minus_1
-+-bra -, ra31
-+-add ra_y2, ra_y2, 1           # Delay 1
-+-nop ; mul24 r1, r1, rb_pitch  # Delay 2
-+-add t0s, r1, ra_frame_base2   # Delay 3
-+-
-+-
-+-################################################################################
-+-
-+-# mc_filter(next_kernel, x, y, frame_base, x2, y2, frame_base2, height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-+-# In a P block, only the first half of coefficients contain used information.
-+-# At this point we have already issued two pairs of texture requests for the current block
-+-# ra_x, ra_x16_base point to the current coordinates for this block
-+-::mc_filter
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-++  max r1, ra_y, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1
-++  nop ; mul24 r1, r1, rb_pitch
-++  add t0s, r1, ra_frame_base
-++
-++  max r1, ra_y2, 0
-++  min r1, r1, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1
-++  nop ; mul24 r1, r1, rb_pitch
-++  add t1s, r1, ra_frame_base2
-++
-++# FALL THROUGHT TO PER-BLOCK SETUP
-++
-++# Start of per-block setup code
-++# P and B blocks share the same setup code to save on Icache space
-++:per_block_setup
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov ra31, unif
-+ 
-+ # per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov rx_xshift2, rx_xshift2_next
-++  mov ra_xshift, ra_xshift_next
-++  mov rx_xshift2, rx_xshift2_next
-+ 
-+ # get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-mov ra_y_next, r1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0   ; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2_next, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+-
-++  mov r3, 16
-++  mov r1, unif # y_x
-++  shl r0,r1,r3 # r0 is x<<16
-++  asr r1,r1,r3 # r1 is y
-++  asr r0,r0,r3 # r0 is x
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++  shl ra_xshift_next, r0, 3 # Compute shifts
-++  mov ra_y_next, r1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add ra_frame_base_next, r2, r0 ; mov r1, unif # y2_x2
-++
-++  shl r0,r1,r3 # r0 is x2<<16
-++  asr r1,r1,r3 # r1 is y2
-++  asr r0,r0,r3 # r0 is x2
-++  add r0, r0, elem_num # Load x
-++  max r0, r0, 0
-++  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-++  shl rx_xshift2_next, r0, 3 # Compute shifts
-++  mov ra_y2_next, r1
-++  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-++  add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+ 
-+ # set up VPM write
-+-mov vw_setup, rb28
-++  mov vw_setup, rb28
-+ 
-+ # get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-++  mov r0, unif
-++  shr r1, r0, r3 # Extract width
-++  sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-++  and r0, r0, rb22 # Extract height
-++  add rb17, r0, 5
-++  add rb18, r0, 7
-++  shl r0, r0, 7
-++  add r0, r0, r1 # Combine width and height of destination area
-++  shl r0, r0, r3 # Shift into bits 16 upwards of the vdw_setup0 register
-++  add rb26, r0, rb27
-+ 
-+ # get filter coefficients and discard unused B frame values
-+-mov r0, unif
-+-mov.ifnz -, unif # Alternate coefficients are unused for P frames
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22 # These may need some pre-rotation to be used in B frames correctly
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-mov.ifnz -, unif
-+-asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb4, r0, rb23
-+-
-+-mov r0, unif # Frame0 offset/weight
-+-mov.ifnz -, unif # Frame1 offset/weight unused
-+-asr rb15, r0, r2  # Compute offset from MSBs
-+-shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-+-
-+-# r3 is loop counter
-++  mov r0, unif ; mov r1,1  # Packed filter offsets, unpack into ra8... (to be used for vertical context later)
-++  asr ra9, r0, rb23;      mul24 r0, r0, ra22 # my2
-++  asr ra8, r0, rb23;      mul24 r0, r0, ra22 # mx2
-++  asr.ifz ra9, r0, rb23;  mul24 r0, r0, ra22 # my:my2
-++  asr.ifz ra8, r0, rb23                      # mx:mx2
-++  sub ra9,3,ra9
-++  sub ra8,3,ra8
-++  shl ra9,ra9,3   # Scale up by 8
-++  shl ra8,ra8,3   # Scale up by 8
-++# Now if we want aligned we have a mul of 1, so put 0 coefficients at the top
-++  mov r1,0xffff00
-++  shl r0, r1, ra8
-++  asr ra0, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb4, r0, rb23
-++
-++  mov r1,0x1040400
-++  shl r0, r1, ra8
-++  asr ra1, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb5, r0, rb23
-++
-++  mov r1,0xfbf5f600
-++  shl r0, r1, ra8
-++  asr ra2, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb6, r0, rb23
-++
-++  mov r1,0x11283a40
-++  shl r0, r1, ra8
-++  asr ra3, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb7, r0, rb23
-++
-++  mov r1,0x3a281100
-++  shl r0, r1, ra8
-++  asr ra4, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb8, r0, rb23
-++
-++  mov r1,0xf6f5fb00
-++  shl r0, r1, ra8
-++  asr ra5, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb9, r0, rb23
-++
-++  mov r1,0x4040100
-++  shl r0, r1, ra8
-++  asr ra6, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb10, r0, rb23
-++
-++  mov r1,0xffff0000
-++  shl r0, r1, ra8
-++  asr ra7, r0, rb23
-++  shl r0, r1, ra9
-++  asr rb11, r0, rb23
-++
-++# Extract weighted prediction information
-++  mov r0, unif      # offset/weight  TODO move up
-++  asr rb15, r0, r3  # Compute offset from MSBs
-++  bra -, ra31
-++  shl r0, r0, r3    #                                                            Delay 1
-++  asr r0, r0, r3 ; mov r3, 0 # Compute weight from LSBs and reset loop counter   Delay 2
-++  shl rb14, r0, 8 # Use a larger shift to avoid unsigned multiply problem        Delay 3
-+ 
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-++################################################################################
-++# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-++# In a P block, y2_x2 should be y_x+8
-++# At this point we have already issued two pairs of texture requests for the current block
-+ 
-+-mov r3, 0
-++::mc_filter
-+ 
-+ :yloop
-+ # retrieve texture results and pick out bytes
-+@@ -858,91 +892,90 @@ mov r3, 0
-+ # If we knew there was no clipping then this code would get simpler.
-+ # Perhaps we could add on the pitch and clip using larger values?
-+ 
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, rx_xshift2
-+-mov.ifz ra_y2, ra_y2_next
-++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++  shr r1, r4, rx_xshift2
-++  mov.ifz ra_y2, ra_y2_next
-+ 
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y2, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++  max r2, ra_y, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+ 
-++  max r2, ra_y2, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+ 
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # apply horizontal filter
-+-nop                  ; mul24 r2, r0, ra0
-+-nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-add r0, r2, r3       ; mov r3, rb31
-+-sub.setf -, r3, 8    ; mov ra12, ra13
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-brr.anyn -, r:yloop
-+-mov ra13, ra14       # Delay slot 1
-+-mov ra14, ra15       # Delay slot 2
-+-mov ra15, r0         # Delay slot 3
-++  nop                  ; mul24 r2, r0, ra0
-++  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++  add r0, r2, r3       ; mov r3, rb31
-++  sub.setf -, r3, 8    ; mov ra8, ra9
-++  mov ra9, ra10
-++  mov ra10, ra11
-++  mov ra11, ra12
-++  mov ra12, ra13
-++  brr.anyn -, r:yloop
-++  mov ra13, ra14       # Delay slot 1
-++  mov ra14, ra15       # Delay slot 2
-++  mov ra15, r0         # Delay slot 3
-+ 
-+ # apply vertical filter and write to VPM
-+ 
-+-nop                     ; mul24 r1, ra14, rb10
-+-nop                     ; mul24 r0, ra13, rb9
-+-add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-
-+-add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-nop                     ; mul24 r1, r1, rb14
-+-add r1, r1, rb12
-+-asr r1, r1, rb13
-+-brr.anyn -, r:yloop
-+-add r1, r1, rb15       # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++  nop                     ; mul24 r1, ra14, rb10
-++  nop                     ; mul24 r0, ra13, rb9
-++  add r1, r1, r0          ; mul24 r0, ra12, rb8
-++  add r1, r1, r0          ; mul24 r0, ra15, rb11
-++  add r1, r1, r0          ; mul24 r0, ra8, rb4
-++  add r1, r1, r0          ; mul24 r0, ra9, rb5
-++  add r1, r1, r0          ; mul24 r0, ra10, rb6
-++  add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++  add r1, r1, r0          ; mov -, vw_wait
-++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++  asr r1, r1, 14
-++  nop                     ; mul24 r1, r1, rb14
-++  add r1, r1, rb12
-++  asr r1, r1, rb13
-++  brr.anyn -, r:yloop
-++  add r1, r1, rb15       # Delay 1
-++  min r1, r1, rb22       # Delay 2
-++  max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out
-+ 
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW setup 0    Delay 1
-+-mov vw_setup, rb29 # Stride         Delay 2
-+-mov vw_addr, unif # start the VDW   Delay 3
-++  brr -, r:per_block_setup
-++  mov vw_setup, rb26 # VDW setup 0    Delay 1
-++  mov vw_setup, rb29 # Stride         Delay 2
-++  mov vw_addr, unif # start the VDW   Delay 3
-+ 
-+ 
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_b(next_kernel, x, y, frame_base, x2, y2, frame_base2, width_height, hcoeffs[0], hcoeffs2[0], hcoeffs[1], hcoeffs2[1], vcoeffs[0], vcoeffs2[0], vcoeffs[1], vcoeffs2[1], offsetweight0, offsetweight1, this_dst)
-++# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+ # In a P block, only the first half of coefficients contain used information.
-+ # At this point we have already issued two pairs of texture requests for the current block
-+ # May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
-+@@ -952,92 +985,6 @@ mov vw_addr, unif # start the VDW   Delay 3
-+ # Or possibly by taking advantage of symmetry?
-+ # From 19->7 32bits per command.
-+ ::mc_filter_b
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+-mov ra31, unif
-+-
-+-# per-channel shifts were calculated on the *previous* invocation
-+-
-+-mov ra_xshift, ra_xshift_next
-+-mov rx_xshift2, rx_xshift2_next
-+-
-+-# get base addresses and per-channel shifts for *next* invocation
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl ra_xshift_next, r0, 3 # Compute shifts
-+-mov ra_y_next, r1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add ra_frame_base_next, r2, r0  # r2 is address for frame0 (not including y offset)
-+-
-+-add r0, unif, elem_num # Load x
-+-max r0, r0, 0   ; mov r1, unif # Load y
-+-min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
-+-shl rx_xshift2_next, r0, 3 # Compute shifts
-+-add ra_y2_next, r1, 1
-+-and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
-+-add rx_frame_base2_next, r2, r0  # r2 is address for frame1 (not including y offset)
-+-
-+-
-+-# set up VPM write
-+-mov vw_setup, rb28
-+-
-+-# get width,height of block
-+-mov r2, 16
-+-mov r0, unif
-+-shr r1, r0, r2 # Extract width
-+-sub rb29, rb24, r1 # Compute vdw_setup1(dst_pitch-width)
-+-and r0, r0, rb22 # Extract height
-+-add rb17, r0, 5
-+-add rb18, r0, 7
-+-shl r0, r0, 7
-+-add r0, r0, r1 # Combine width and height of destination area
-+-shl r0, r0, r2 # Shift into bits 16 upwards of the vdw_setup0 register
-+-add rb26, r0, rb27
-+-
-+-# get filter coefficients and discard unused B frame values
-+-mov r0, unif
-+-mov r1, 1
-+-mov.ifnz r0, unif # Alternate coefficients are unused for P frames
-+-nop              ;      mul24 r0, r0 << 13, r1 << 13
-+-asr ra3, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 14, r1 << 14
-+-asr ra2, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 15, r1 << 15 # Adjust such that a rotate of 1 will produce the values with first 8 on left, second 8 on right
-+-asr ra1, r0, rb23;      mul24 r0, r0, ra22
-+-asr ra0, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-nop              ;      mul24 r0, r0 << 9, r1 << 9
-+-asr ra7, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 10, r1 << 10
-+-asr ra6, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 11, r1 << 11
-+-asr ra5, r0, rb23;      mul24 r0, r0, ra22
-+-nop              ;      mul24 r0, r0 << 12, r1 << 12
-+-asr ra4, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-asr rb11, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb10, r0, rb23;     mul24 r0, r0, ra22
-+-asr rb9, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb8, r0, rb23;      mov r0, unif
-+-mov.ifnz r0, unif
-+-asr rb7, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb6, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb5, r0, rb23;      mul24 r0, r0, ra22
-+-asr rb4, r0, rb23
-+-
-+-mov r0, unif # Frame0 offset/weight
-+-mov.ifnz r0, unif # Frame1 offset/weight unused
-+-asr rb15, r0, r2  # Compute offset from MSBs
-+-shl r0, r0, r2
-+-asr rb14, r0, r2  # Compute weight from LSBs
-+-
-+-# r3 is loop counter
-+-
-+-# retrieve texture results and pick out bytes
-+-# then submit two more texture requests
-+-
-+-mov r3, 0
-+-
-+ :yloopb
-+ # retrieve texture results and pick out bytes
-+ # then submit two more texture requests
-+@@ -1045,111 +992,123 @@ mov r3, 0
-+ # If we knew there was no clipping then this code would get simpler.
-+ # Perhaps we could add on the pitch and clip using larger values?
-+ 
-+-sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-+-shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu0
-+-mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+-mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-+-shr r1, r4, rx_xshift2
-+-mov.ifz ra_y2, ra_y2_next
-++  sub.setf -, r3, rb17      ; v8adds r3, r3, ra20                            ; ldtmu0
-++  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
-++  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-++  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
-++  shr r1, r4, rx_xshift2
-++  mov.ifz ra_y2, ra_y2_next
-+ 
-+-max r2, ra_y, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+-
-+-max r2, ra_y2, 0  # y
-+-min r2, r2, rb_frame_height_minus_1
-+-add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-+-add t0s, ra_frame_base2, r2   ; v8subs r0, r0, rb20
-++  max r2, ra_y, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
-++  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
-+ 
-++  max r2, ra_y2, 0  # y
-++  min r2, r2, rb_frame_height_minus_1
-++  add ra_y2, ra_y2, 1            ; mul24 r2, r2, r3
-++  add t1s, ra_frame_base2, r2   ; v8subs r1, r1, rb20
-+ 
-+ # generate seven shifted versions
-+ # interleave with scroll of vertical context
-+ 
-+-mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ 
-+ # apply horizontal filter
-+-nop                  ; mul24 r2, r0, ra0
-+-nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-+-nop                  ; mul24      r3, ra1 << 1, r0 << 1
-+-nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-+-add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-+-nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-+-add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-+-nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-+-add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-+-nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-+-add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-+-nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-+-add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-+-nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-+-add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-+-nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-+-add r0, r2, r3       ; mov r3, rb31
-+-sub.setf -, r3, 8    ; mov ra12, ra13
-+-mov ra9, ra10
-+-mov ra10, ra11
-+-mov ra11, ra12
-+-mov ra12, ra13
-+-brr.anyn -, r:yloopb
-+-mov ra13, ra14       # Delay slot 1
-+-mov ra14, ra15       # Delay slot 2
-+-mov ra15, r0         # Delay slot 3
-+-
-+-# apply vertical filter and write to VPM
-+-
-+-nop                     ; mul24 r1, ra14, rb10
-+-nop                     ; mul24 r0, ra13, rb9
-+-add r1, r1, r0          ; mul24 r0, ra12, rb8
-+-add r1, r1, r0          ; mul24 r0, ra15, rb11
-+-add r1, r1, r0          ; mul24 r0, ra8, rb4
-+-add r1, r1, r0          ; mul24 r0, ra9, rb5
-+-add r1, r1, r0          ; mul24 r0, ra10, rb6
-+-add r1, r1, r0          ; mul24 r0, ra11, rb7
-+-
-+-add r1, r1, r0          ; mov -, vw_wait
-+-sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-+-asr r1, r1, 14
-+-nop                     ; mul24 r1, r1 << 8, ra20 << 8 # Rotate to align left and right halves
-+-add r1, r1, ra30        ; mul24 r0, r1, rb14
-+-add r1, r1, r0
-+-brr.anyn -, r:yloopb
-+-asr r1, r1, 7          # Delay 1
-+-min r1, r1, rb22       # Delay 2
-+-max vpm, r1, 0         # Delay 3
-++  nop                  ; mul24 r2, r0, ra0
-++  nop                  ; mul24.ifnz r2, ra0 << 8, r1 << 8
-++  nop                  ; mul24      r3, ra1 << 1, r0 << 1
-++  nop                  ; mul24.ifnz r3, ra1 << 9, r1 << 9
-++  add r2, r2, r3       ; mul24    r3, ra2 << 2, r0 << 2
-++  nop                  ; mul24.ifnz r3, ra2 << 10, r1 << 10
-++  add r2, r2, r3       ; mul24    r3, ra3 << 3, r0 << 3
-++  nop                  ; mul24.ifnz r3, ra3 << 11, r1 << 11
-++  add r2, r2, r3       ; mul24    r3, ra4 << 4, r0 << 4
-++  nop                  ; mul24.ifnz r3, ra4 << 12, r1 << 12
-++  add r2, r2, r3       ; mul24    r3, ra5 << 5, r0 << 5
-++  nop                  ; mul24.ifnz r3, ra5 << 13, r1 << 13
-++  add r2, r2, r3       ; mul24    r3, ra6 << 6, r0 << 6
-++  nop                  ; mul24.ifnz r3, ra6 << 14, r1 << 14
-++  add r2, r2, r3       ; mul24    r3, ra7 << 7, r0 << 7
-++  nop                  ; mul24.ifnz r3, ra7 << 15, r1 << 15
-++  add r0, r2, r3       ; mov r3, rb31
-++  sub.setf -, r3, 8    ; mov ra8, ra9
-++  mov ra9, ra10
-++  mov ra10, ra11
-++  mov ra11, ra12
-++  mov ra12, ra13
-++  brr.anyn -, r:yloopb
-++  mov ra13, ra14       # Delay slot 1
-++  mov ra14, ra15       # Delay slot 2
-++  mov ra15, r0         # Delay slot 3
-++
-++  # apply vertical filter and write to VPM
-++
-++  nop                     ; mul24 r1, ra14, rb10
-++  nop                     ; mul24 r0, ra13, rb9
-++  add r1, r1, r0          ; mul24 r0, ra12, rb8
-++  add r1, r1, r0          ; mul24 r0, ra15, rb11
-++  add r1, r1, r0          ; mul24 r0, ra8, rb4
-++  add r1, r1, r0          ; mul24 r0, ra9, rb5
-++  add r1, r1, r0          ; mul24 r0, ra10, rb6
-++  add r1, r1, r0          ; mul24 r0, ra11, rb7
-++
-++  add r1, r1, r0          ; mov -, vw_wait
-++  sub.setf -, r3, rb18    ; mul24 r1, r1, ra22
-++  asr r0, r1, 14
-++  asr r1, r1, 6           # Wait state so we can use the rotate instruction
-++  nop                     ; mul24 r0, r0 << 8, ra22 << 8 # Rotate to align left and right halves
-++  add r1, r1, ra18
-++  add r1, r1, r0
-++  brr.anyn -, r:yloopb
-++  asr r1, r1, 15         # Delay 1
-++  min r1, r1, rb22       # Delay 2
-++  max vpm, r1, 0         # Delay 3
-+ 
-+ # DMA out
-+-bra -, ra31
-+-mov vw_setup, rb26 # VDW setup 0    Delay 1
-+-mov vw_setup, rb29 # Stride         Delay 2
-+-mov vw_addr, unif # start the VDW   Delay 3
-++  brr -, r:per_block_setup
-++  mov vw_setup, rb26 # VDW setup 0    Delay 1
-++  mov vw_setup, rb29 # Stride         Delay 2
-++  mov vw_addr, unif # start the VDW   Delay 3
-+ 
-+ ################################################################################
-+ 
-+ # mc_interrupt_exit12()
-+ ::mc_interrupt_exit12
-+-mov  -, vw_wait # wait on the VDW
-+-
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-ldtmu0
-+-
-+-mov -,sacq(0) # 1
-+-mov -,sacq(0) # 2
-+-mov -,sacq(0) # 3
-+-mov -,sacq(0) # 4
-+-mov -,sacq(0) # 5
-+-mov -,sacq(0) # 6
-+-mov -,sacq(0) # 7
-+-mov -,sacq(0) # 8
-+-mov -,sacq(0) # 9
-+-mov -,sacq(0) # 10
-+-mov -,sacq(0) # 11
-+-
-+-nop        ; nop ; thrend
-+-mov interrupt, 1; nop # delay slot 1
-+-nop        ; nop # delay slot 2
-++  mov  -, vw_wait # wait on the VDW
-++
-++  ldtmu0
-++  ldtmu0
-++  ldtmu1
-++  ldtmu1
-++
-++  mov -,sacq(0) # 1
-++  mov -,sacq(0) # 2
-++  mov -,sacq(0) # 3
-++  mov -,sacq(0) # 4
-++  mov -,sacq(0) # 5
-++  mov -,sacq(0) # 6
-++  mov -,sacq(0) # 7
-++  mov -,sacq(0) # 8
-++  mov -,sacq(0) # 9
-++  mov -,sacq(0) # 10
-++  mov -,sacq(0) # 11
-++
-++  nop        ; nop ; thrend
-++  mov interrupt, 1; nop # delay slot 1
-++  nop        ; nop # delay slot 2
-++
-++
-++::mc_exit1
-++  mov  -, vw_wait # wait on the VDW
-++
-++  ldtmu0
-++  ldtmu1
-++  ldtmu0
-++  ldtmu1
-++  nop        ; nop ; thrend
-++  mov interrupt, 1; nop # delay slot 1
-++  nop        ; nop # delay slot 2
-+ 
-+ 
-+ ::mc_end
-+-- 
-+2.5.0
-+
-+
-+From 2be17e0759404007c938bdd478e1e76445d9ecbe Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 10:58:25 +0100
-+Subject: [PATCH 48/68] Added option to simulate QPUs
-+
-+---
-+ libavcodec/hevc.c          | 288 +++++++++++++++++++++++++++++++++++++++++++--
-+ libavcodec/rpi_qpu.c       |  24 ++--
-+ libavcodec/rpi_shader.qasm |   6 +-
-+ 3 files changed, 295 insertions(+), 23 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index c6b619b..7914afb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -54,6 +54,8 @@
-+   // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
-+   // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
-+ 
-++  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-++  //#define RPI_SIMULATE_QPUS
-+ 
-+ #endif
-+ 
-+@@ -122,7 +124,6 @@ static void pic_arrays_free(HEVCContext *s)
-+ 
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    printf("pic_arrays_free\n");
-+     if (s->coeffs_buf_arm[0]) {
-+       gpu_free(&s->coeffs_buf_default);
-+       s->coeffs_buf_arm[0] = 0;
-+@@ -172,11 +173,9 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ #ifdef RPI
-+ #ifdef EARLY_MALLOC
-+ #else
-+-    assert(sps);
-++    av_assert0(sps);
-+     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    printf("pic_arrays_init\n");
-+-    printf("Allocated %d\n",coefs_per_row);
-+     gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+     s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+     if (!s->coeffs_buf_arm[0])
-+@@ -2975,6 +2974,274 @@ static void rpi_inter_clear(HEVCContext *s)
-+ #endif
-+ }
-+ 
-++
-++#ifdef RPI_SIMULATE_QPUS
-++
-++static int32_t clipx(int x,int FRAME_WIDTH)
-++{
-++	if (x<=0) return 0;
-++	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
-++	return x;
-++}
-++
-++static int32_t clipy(int y,int FRAME_HEIGHT)
-++{
-++	if (y<=0) return 0;
-++	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
-++	return y;
-++}
-++
-++/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
-++{
-++   int32_t vsum = 0;
-++   int x, y;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
-++
-++      vsum += lumaFilter[my][y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+round)>>denom)+offset;
-++
-++   return av_clip_uint8( vsum );
-++}*/
-++
-++static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-++{
-++  int32_t vsum = 0;
-++  int x, y;
-++  int chromaFilterH[4];
-++  int chromaFilterV[4];
-++  int i;
-++  int offset_after = offset_weight>>16;
-++  int weight = (offset_weight<<16)>>16;
-++  for(i=0;i<4;i++) {
-++    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
-++    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
-++  }
-++
-++   for (y = 0; y < 4; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 4; x++)
-++         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-++
-++      vsum += chromaFilterV[y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-++
-++   return vsum;
-++}
-++
-++int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
-++
-++static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
-++{
-++  int32_t vsum = 0;
-++  int x, y;
-++  int i;
-++  int offset_after = offset_weight>>16;
-++  int weight = (offset_weight<<16)>>16;
-++
-++   for (y = 0; y < 8; y++) {
-++      int32_t hsum = 0;
-++
-++      for (x = 0; x < 8; x++)
-++         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
-++
-++      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
-++   }
-++   vsum >>= 6;
-++   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
-++
-++   return vsum;
-++}
-++
-++static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, int cIdx)
-++{
-++  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
-++  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
-++  int pitch = frame->linesize[cIdx];
-++  uint32_t base = get_vc_address(frame->buf[cIdx]);
-++  if (p>=base && p<base+pitch*pic_height) {
-++    return frame->data[cIdx] + (p-base);
-++  }
-++  return NULL;
-++}
-++
-++static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-++{
-++  SliceHeader *sh   = &s->sh;
-++  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-++  int i;
-++  if (arm) return arm;
-++  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-++  {
-++    for(i=0;i<sh->nb_refs[L0];i++) {
-++      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-++      if (arm) return arm;
-++    }
-++  }
-++  if (sh->slice_type == B_SLICE) {
-++    for(i=0;i<sh->nb_refs[L1];i++) {
-++      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-++      if (arm) return arm;
-++    }
-++  }
-++  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-++  exit(-1);
-++  return NULL;
-++}
-++
-++static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-++{
-++  uint32_t next_kernel;
-++  uint32_t x0;
-++  uint32_t y0;
-++  uint8_t *ref_u_base;
-++  uint8_t *ref_v_base;
-++  uint32_t frame_width = p[5];
-++  uint32_t frame_height = p[6];
-++  uint32_t pitch = p[7];
-++  uint32_t dst_pitch = p[8];
-++  int32_t offset_before = p[9];
-++  int32_t denom = p[10];
-++  uint32_t vpm_id = p[11];
-++  uint32_t tmp_u_dst[256];
-++  uint32_t tmp_v_dst[256];
-++  while(1) {
-++    p += 12;
-++    next_kernel = p[0-12];
-++    x0 = p[1-12];
-++    y0 = p[2-12];
-++    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
-++      int x,y;
-++      uint32_t width_height = p[5];
-++      uint32_t hcoeffs = p[6];
-++      uint32_t vcoeffs = p[7];
-++      uint32_t offset_weight_u = p[8];
-++      uint32_t offset_weight_v = p[9];
-++      uint8_t *this_u_dst;
-++      uint8_t *this_v_dst;
-++      uint32_t width = width_height >> 16;
-++      uint32_t height = (width_height << 16) >> 16;
-++      ref_u_base = compute_arm_addr(s,p[3-12],1);
-++      ref_v_base = compute_arm_addr(s,p[4-12],2);
-++      if (next_kernel!=s->mc_filter_uv_b0)
-++      {
-++        this_u_dst = compute_arm_addr(s,p[10],1);
-++        this_v_dst = compute_arm_addr(s,p[11],2);
-++      }
-++      for (y=0; y<height; ++y) {
-++        for (x=0; x<width; ++x) {
-++          if (next_kernel==s->mc_filter_uv) {
-++            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          } else if (next_kernel==s->mc_filter_uv_b0) {
-++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-++            tmp_u_dst[x+y*16] = refa;
-++            tmp_v_dst[x+y*16] = refb;
-++          } else {
-++            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-++            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-++            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          }
-++        }
-++      }
-++    } else {
-++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-++      break;
-++    }
-++  }
-++}
-++
-++// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-++{
-++  uint32_t next_kernel;
-++  int y_x,y2_x2;
-++  uint32_t x0;
-++  uint32_t y0;
-++  uint32_t x2;
-++  uint32_t y2;
-++  uint8_t *ref_y_base;
-++  uint8_t *ref_y2_base;
-++  uint32_t frame_width_height = p[4];
-++  uint32_t frame_width = frame_width_height>>16;
-++  uint32_t frame_height = (frame_width_height<<16)>>16;
-++  uint32_t pitch = p[5];
-++  uint32_t dst_pitch = p[6];
-++  int offset_shift = p[7];
-++  int32_t offset_before = offset_shift>>16;
-++  int32_t denom = (offset_shift<<16)>>16;
-++  while(1) {
-++    p += 9;
-++    next_kernel = p[8-9];
-++    y_x = p[0-9];
-++    x0 = (y_x<<16)>>16;
-++    y0 = y_x>>16;
-++    y2_x2 = p[2-9];
-++    x2 = (y2_x2<<16)>>16;
-++    y2 = y2_x2>>16;
-++
-++    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
-++      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-++      int x,y;
-++      uint32_t width_height = p[4];
-++      uint32_t my2_mx2_my_mx = p[5];
-++      uint32_t offset_weight = p[6];
-++      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-++      uint32_t width = width_height >> 16;
-++      uint32_t height = (width_height << 16) >> 16;
-++      ref_y_base = compute_arm_addr(s,p[1-9],0);
-++      ref_y2_base = compute_arm_addr(s,p[3-9],0);
-++      for (y=0; y<height; ++y) {
-++        for (x=0; x<width; ++x) {
-++          if (next_kernel==s->mc_filter) {
-++            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-++            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++          }
-++          else {
-++            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-++            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-++            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-++          }
-++        }
-++      }
-++    } else {
-++      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-++      break;
-++    }
-++  }
-++}
-++
-++static void rpi_simulate_inter_qpu(HEVCContext *s)
-++{
-++  // First run the transform as normal
-++  int i;
-++  rpi_execute_transform(s);
-++  for(i=0;i<8;i++)
-++  {
-++    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-++  }
-++  for(i=0;i<12;i++)
-++  {
-++    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
-++  }
-++}
-++
-++#endif
-++
-++
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-+@@ -2993,7 +3260,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+-        assert(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-++        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+     s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+@@ -3003,11 +3270,16 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+         s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        assert(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-+     }
-+     s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ #endif
-+ 
-++#ifdef RPI_SIMULATE_QPUS
-++    rpi_simulate_inter_qpu(s);
-++    s->vpu_id = -1;
-++    return;
-++#endif
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+@@ -3088,7 +3360,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+                     && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-+ #endif
-+ 
-+-    /*if (!s->enable_rpi) {
-++    if (!s->enable_rpi) {
-+       if (s->ps.pps->cross_component_prediction_enabled_flag)
-+         printf("Cross component\n");
-+       if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-+@@ -3097,7 +3369,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         printf("Weighted P slice\n");
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+-    }*/
-++    }
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index e12304b..4480f72 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -13,7 +13,7 @@
-+ #include <stdlib.h>
-+ #include <string.h>
-+ #include <stddef.h>
-+-#include <assert.h>
-++#include "libavutil/avassert.h"
-+ 
-+ #include "config.h"
-+ 
-+@@ -160,13 +160,13 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   // Now copy over the QPU code into GPU memory
-+   {
-+     int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
-+-    assert(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+   }
-+   // And the VPU code
-+   {
-+     int num_bytes = sizeof(rpi_hevc_transform);
-+-    assert(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+     memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
-+   }
-+   // And the transform coefficients
-+@@ -216,13 +216,13 @@ static void gpu_unlock(void) {
-+ static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+   p->numbytes = numbytes;
-+   p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+-  assert(p->vcsm_handle);
-++  av_assert0(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-  assert(p->vc_handle);
-++  av_assert0(p->vc_handle);
-+   p->arm = vcsm_lock(p->vcsm_handle);
-+-  assert(p->arm);
-++  av_assert0(p->arm);
-+   p->vc = mem_lock(mb, p->vc_handle);
-+-  assert(p->vc);
-++  av_assert0(p->vc);
-+   return 0;
-+ }
-+ 
-+@@ -243,7 +243,7 @@ int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+ 
-+ int gpu_get_mailbox(void)
-+ {
-+-  assert(gpu);
-++  av_assert0(gpu);
-+   return gpu->mb;
-+ }
-+ 
-+@@ -297,13 +297,13 @@ static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+   //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+-  assert(p->vcsm_handle);
-++  av_assert0(p->vcsm_handle);
-+   p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+-  assert(p->vc_handle);
-++  av_assert0(p->vc_handle);
-+   p->arm = vcsm_lock(p->vcsm_handle);
-+-  assert(p->arm);
-++  av_assert0(p->arm);
-+   p->vc = mem_lock(gpu->mb, p->vc_handle);
-+-  assert(p->vc);
-++  av_assert0(p->vc);
-+   return 0;
-+ }
-+ 
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 60d1ec2..0686249 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -149,8 +149,8 @@ add t0s, r0, r1 ; mov ra_frame_base, r2
-+ add t1s, r2, r1
-+ 
-+ mov r2,8
-+-shl rb12,unif, r2 # offset before shift
-+-add rb13,unif,r2  # offset after shift
-++shl rb12,unif,r2 # offset before shift
-++add rb13,unif,r2  # denominator
-+ 
-+ # Compute part of VPM to use for DMA output
-+ mov r2, unif
-+@@ -185,7 +185,7 @@ add t1s, r1, ra_frame_base
-+ 
-+ ################################################################################
-+ 
-+-# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
-++# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
-+ 
-+ # At this point we have already issued two pairs of texture requests for the current block
-+ # ra_x, ra_x16_base point to the current coordinates for this block
-+-- 
-+2.5.0
-+
-+
-+From 70805b593a428f11dcaf1e558214884601f6c44a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 13:17:50 +0100
-+Subject: [PATCH 49/68] Increased motion vector memory and fixed block size
-+ computation for non-multiple of 2 block sizes
-+
-+---
-+ libavcodec/hevc.c | 50 +++++++++++++++++++++++++++++++-------------------
-+ 1 file changed, 31 insertions(+), 19 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 7914afb..0d947ea 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -81,11 +81,9 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ // Split image of 2048 into parts 64 wide
-+ // So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-+-// Each block of 64*64
-+-// Smallest CTU size is 16x16, so smallest block is 8x8
-+-// Corresponds to a total of 83kbytes over all 12 QPUs
-++// For each block of 64*64 the smallest block size is 8x4
-+ #define RPI_LUMA_COMMAND_WORDS 9
-+-#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*8)) * RPI_LUMA_COMMAND_WORDS)
-++#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -2029,11 +2027,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+-                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   if (weight_flag) {
-+                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+@@ -2076,12 +2076,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       if (weight_flag) {
-+@@ -2128,11 +2130,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+-                  *y++ = ( (nPbW<16 ? nPbW : 16) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   if (weight_flag) {
-+                       *y++ = (s->sh.luma_offset_l0[current_mv.ref_idx[reflist]] << 16) + (s->sh.luma_weight_l0[current_mv.ref_idx[reflist]] & 0xffff);
-+@@ -2176,12 +2180,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       // TODO chroma weight and offset... s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+@@ -2233,11 +2239,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             uint32_t *y = s->y_mvs[chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-++                  int bw = nPbW-start_x;
-++                  int bh = nPbH-start_y;
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[0]);
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[0]);
-+-                  *y++ = ( (nPbW<8 ? nPbW : 8) << 16 ) + (nPbH<16 ? nPbH : 16);
-++                  *y++ = ( (bw<8 ? bw : 8) << 16 ) + (bh<16 ? bh : 16);
-+                   *y++ = my2_mx2_my_mx;
-+                   *y++ = 1; // B frame weighted prediction not supported
-+                   *y++ = (get_vc_address(s->frame->buf[0]) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
-+@@ -2280,12 +2288,14 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 uint32_t *u = s->u_mvs[chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-++                      int bw = nPbW_c-start_x;
-++                      int bh = nPbH_c-start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref0->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx][0];
-+                       *u++ = rpi_filter_coefs[_my][0];
-+                       u+=2; // Weights not supported in B slices
-+@@ -2296,7 +2306,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[1]);
-+                       u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address(ref1->frame->buf[2]);
-+-                      *u++ = ( (nPbW_c<RPI_CHROMA_BLOCK_WIDTH ? nPbW_c : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (nPbH_c<16 ? nPbH_c : 16);
-++                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
-+                       *u++ = rpi_filter_coefs[_mx2][0];
-+                       *u++ = rpi_filter_coefs[_my2][0];
-+                       u+=2; // Weights not supported in B slices
-+@@ -3165,14 +3175,15 @@ static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+ }
-+ 
-+ // mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+-static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-++static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+ {
-+   uint32_t next_kernel;
-+   int y_x,y2_x2;
-+-  uint32_t x0;
-+-  uint32_t y0;
-+-  uint32_t x2;
-+-  uint32_t y2;
-++  int x0;
-++  int y0;
-++  int x2;
-++  int y2;
-++  uint32_t *p0 = p;
-+   uint8_t *ref_y_base;
-+   uint8_t *ref_y2_base;
-+   uint32_t frame_width_height = p[4];
-+@@ -3202,13 +3213,15 @@ static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p)
-+       uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+       uint32_t width = width_height >> 16;
-+       uint32_t height = (width_height << 16) >> 16;
-++      uint8_t *dst_base = s->frame->data[0];
-+       ref_y_base = compute_arm_addr(s,p[1-9],0);
-+       ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+       for (y=0; y<height; ++y) {
-+         for (x=0; x<width; ++x) {
-+           if (next_kernel==s->mc_filter) {
-+             int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+-            this_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-++            refa = av_clip_uint8(refa);
-++            this_dst[x+y*dst_pitch] = refa;
-+           }
-+           else {
-+             int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+@@ -3235,7 +3248,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+   }
-+   for(i=0;i<12;i++)
-+   {
-+-    rpi_simulate_inter_luma(s,s->y_mvs_base[i]);
-++    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+   }
-+ }
-+ 
-+@@ -3277,7 +3290,6 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+     rpi_simulate_inter_qpu(s);
-+-    s->vpu_id = -1;
-+     return;
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From 1bd38623db52970590df65f4a7338d924c63a781 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 14:36:54 +0100
-+Subject: [PATCH 50/68] Added support for skip deblock
-+
-+---
-+ libavcodec/hevc.c        |  5 +++++
-+ libavcodec/hevc.h        |  2 ++
-+ libavcodec/hevc_filter.c | 14 ++++----------
-+ 3 files changed, 11 insertions(+), 10 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 0d947ea..1812801 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3384,6 +3384,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #endif
-++    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-++                        s->nal_unit_type == NAL_TSA_N   ||
-++                        s->nal_unit_type == NAL_STSA_N  ||
-++                        s->nal_unit_type == NAL_RADL_N  ||
-++                        s->nal_unit_type == NAL_RASL_N);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 33dedf7..aa4d218 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -916,6 +916,8 @@ typedef struct HEVCContext {
-+     int                 width;
-+     int                 height;
-+ 
-++    int used_for_ref;
-++
-+ #ifdef RPI
-+     int enable_rpi;
-+     HEVCMvCmd *unif_mv_cmds;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 11629e4..14a0952 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -512,16 +512,14 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                s->ps.pps->transquant_bypass_enable_flag;
-+ 
-+ #ifdef DISABLE_DEBLOCK_NONREF
-+-    if (    s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )
-++    if (!s->used_for_ref)
-+       return; // Don't deblock non-reference frames
-+ #endif
-+ #ifdef DISABLE_DEBLOCK
-+     return;
-+ #endif
-++    if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF)
-++        return;
-+ 
-+     if (x0) {
-+         left_tc_offset   = s->deblock[ctb - 1].tc_offset;
-+@@ -885,11 +883,7 @@ static int ff_hevc_buf_base(AVBufferRef *bref) {
-+ 
-+ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+-    if (s->enable_rpi && !(  s->nal_unit_type == NAL_TRAIL_N ||
-+-            s->nal_unit_type == NAL_TSA_N   ||
-+-            s->nal_unit_type == NAL_STSA_N  ||
-+-            s->nal_unit_type == NAL_RADL_N  ||
-+-            s->nal_unit_type == NAL_RASL_N )) {
-++    if (s->enable_rpi && s->used_for_ref) {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+         int curr_y = ((int *)f->progress->data)[0];
-+-- 
-+2.5.0
-+
-+
-+From 691cba7253bc997f6e8020542203c5733930d997 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Tue, 2 Jun 2015 15:22:52 +0100
-+Subject: [PATCH 51/68] Added support for skip_frame
-+
-+---
-+ libavcodec/hevc.c | 15 ++++++++++-----
-+ 1 file changed, 10 insertions(+), 5 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 1812801..94ff677 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3384,11 +3384,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #endif
-+-    s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-+-                        s->nal_unit_type == NAL_TSA_N   ||
-+-                        s->nal_unit_type == NAL_STSA_N  ||
-+-                        s->nal_unit_type == NAL_RADL_N  ||
-+-                        s->nal_unit_type == NAL_RASL_N);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+@@ -3848,6 +3843,16 @@ static int decode_nal_unit(HEVCContext *s, const HEVCNAL *nal)
-+         if (ret < 0)
-+             return ret;
-+ 
-++        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
-++                        s->nal_unit_type == NAL_TSA_N   ||
-++                        s->nal_unit_type == NAL_STSA_N  ||
-++                        s->nal_unit_type == NAL_RADL_N  ||
-++                        s->nal_unit_type == NAL_RASL_N);
-++
-++        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
-++            s->is_decoded = 0;
-++            break;
-++        }
-+         if (s->max_ra == INT_MAX) {
-+             if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
-+                 s->max_ra = s->poc;
-+-- 
-+2.5.0
-+
-+
-+From b489872a14709b7e04285e039dff80b75823eb72 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 09:15:38 +0100
-+Subject: [PATCH 52/68] Fixed cache flushing of luma when using old method
-+
-+---
-+ libavcodec/hevc_filter.c | 2 +-
-+ 1 file changed, 1 insertion(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 14a0952..b286bbf 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -919,7 +919,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         flush_buffer(s->frame->buf[1]);
-+         flush_buffer(s->frame->buf[2]);
-+ #ifdef RPI_LUMA_QPU
-+-        flush_buffer(s->frame->buf[1]);
-++        flush_buffer(s->frame->buf[0]);
-+ #endif
-+ 
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 6a4811cba68b1c27326300b37e43cdbad45ec45e Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 11:37:27 +0100
-+Subject: [PATCH 53/68] Option to parallelise coefficient decode and inter
-+ prediction and deblock for each frame
-+
-+---
-+ libavcodec/hevc.c              | 701 +++++++++++++++++++++++++++--------------
-+ libavcodec/hevc.h              |  74 +++--
-+ libavcodec/hevc_cabac.c        |  12 +-
-+ libavcodec/hevcpred_template.c |   5 +-
-+ 4 files changed, 522 insertions(+), 270 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 94ff677..594340a 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,8 +41,6 @@
-+ 
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-+-  // For some unknown reason, the code seems to crash if I do a late malloc
-+-  //#define EARLY_MALLOC
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ 
-+@@ -56,6 +54,21 @@
-+ 
-+   // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs
-+   //#define RPI_SIMULATE_QPUS
-++  #ifdef RPI_WORKER
-++    #include "pthread.h"
-++  #endif
-++
-++  static void rpi_execute_dblk_cmds(HEVCContext *s);
-++  static void rpi_execute_transform(HEVCContext *s);
-++  static void rpi_execute_inter_qpu(HEVCContext *s);
-++  static void rpi_execute_pred_cmds(HEVCContext *s);
-++  static void rpi_execute_inter_cmds(HEVCContext *s);
-++  static void rpi_inter_clear(HEVCContext *s);
-++
-++  // Define INTER_PASS0 to do inter prediction in first pass
-++  //#define INTER_PASS0
-++  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
-++  //#define LAUNCH_PASS0
-+ 
-+ #endif
-+ 
-+@@ -103,6 +116,143 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+   return p->vc;
-+ }
-++#endif
-++
-++
-++#ifdef RPI_WORKER
-++
-++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
-++
-++#define LOG_ENTER
-++#define LOG_EXIT
-++
-++// Call this when we have completed pass0 and wish to trigger pass1 for the current job
-++static void worker_submit_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  //pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
-++  s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-++  //pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this to say we have completed pass1
-++static void worker_complete_middle_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  //pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
-++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
-++  //pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this to say we have completed pass2
-++static void worker_complete_job(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  //pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_head++; // This is the only place that can change head so we do not need the mutex
-++  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
-++  //pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call this to wait for all jobs to have completed at the end of a frame
-++static void worker_wait(HEVCContext *s)
-++{
-++  LOG_ENTER
-++  pthread_mutex_lock(&s->worker_mutex);
-++  while( s->worker_head !=s->worker_tail)
-++  {
-++    pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-++  }
-++  pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-++// available to receive the next job.
-++static void worker_pass0_ready(HEVCContext *s)
-++{
-++  LOG_ENTER
-++    pthread_mutex_lock(&s->worker_mutex);
-++    // tail is number of submitted jobs
-++    // head is number of completed jobs
-++    // tail-head is number of outstanding jobs in the queue
-++    // we need to ensure there is at least 1 space left for us to use
-++    while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
-++    {
-++      // Wait until another job is completed
-++      pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++  LOG_EXIT
-++}
-++
-++static void *worker_start(void *arg)
-++{
-++  HEVCContext *s = (HEVCContext *)arg;
-++  while(1) {
-++    pthread_mutex_lock(&s->worker_mutex);
-++
-++    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-++    {
-++      pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++
-++    if (s->kill_worker) {
-++      break;
-++    }
-++    LOG_ENTER
-++    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-++#ifndef LAUNCH_PASS0
-++    rpi_execute_inter_qpu(s);
-++#endif
-++#ifndef INTER_PASS0
-++    // Perform inter prediction
-++    rpi_execute_inter_cmds(s);
-++#endif
-++    // Wait for transform completion
-++    vpu_wait(s->vpu_id);
-++
-++    worker_complete_middle_job(s);
-++    LOG_EXIT
-++  }
-++  return NULL;
-++}
-++
-++static void *worker_deblock_start(void *arg)
-++{
-++  HEVCContext *s = (HEVCContext *)arg;
-++  while(1) {
-++    pthread_mutex_lock(&s->worker_mutex);
-++    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
-++    {
-++      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
-++    }
-++    pthread_mutex_unlock(&s->worker_mutex);
-++
-++    if (s->kill_worker) {
-++      break;
-++    }
-++    LOG_ENTER
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++
-++    worker_complete_job(s);
-++    LOG_EXIT
-++  }
-++  return NULL;
-++}
-+ 
-+ #endif
-+ 
-+@@ -119,19 +269,18 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+ static void pic_arrays_free(HEVCContext *s)
-+ {
-+ #ifdef RPI
-+-
-+-#ifdef EARLY_MALLOC
-+-#else
-+-    if (s->coeffs_buf_arm[0]) {
-+-      gpu_free(&s->coeffs_buf_default);
-+-      s->coeffs_buf_arm[0] = 0;
-+-    }
-+-    if (s->coeffs_buf_arm[2]) {
-+-      gpu_free(&s->coeffs_buf_accelerated);
-+-      s->coeffs_buf_arm[2] = 0;
-++    int job;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++      if (s->coeffs_buf_arm[job][0]) {
-++        gpu_free(&s->coeffs_buf_default[job]);
-++        s->coeffs_buf_arm[job][0] = 0;
-++      }
-++      if (s->coeffs_buf_arm[job][2]) {
-++        gpu_free(&s->coeffs_buf_accelerated[job]);
-++        s->coeffs_buf_arm[job][2] = 0;
-++      }
-+     }
-+ #endif
-+-#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -169,24 +318,26 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-+ #ifdef RPI
-+-#ifdef EARLY_MALLOC
-+-#else
-+     av_assert0(sps);
-+     int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-    s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-    if (!s->coeffs_buf_arm[0])
-+-        goto fail;
-+-    gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-    s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-    s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-    if (!s->coeffs_buf_arm[2])
-+-        goto fail;
-+-    s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-    s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-    printf("Done\n");
-+-#endif
-++    int job;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++      printf("Allocated %d\n",coefs_per_row);
-++      for(job=0;job<RPI_MAX_JOBS;job++) {
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
-++        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-++        if (!s->coeffs_buf_arm[job][0])
-++            goto fail;
-++        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-++        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-++        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-++        if (!s->coeffs_buf_arm[job][2])
-++            goto fail;
-++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-++        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-++      }
-++    }
-+ #endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+@@ -1023,7 +1174,7 @@ static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0,
-+ {
-+     if (s->enable_rpi) {
-+         HEVCLocalContext *lc = s->HEVClc;
-+-        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+         cmd->type = RPI_PRED_INTRA;
-+         cmd->size = log2_trafo_size;
-+         cmd->c_idx = c_idx;
-+@@ -1483,7 +1634,7 @@ static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                         AVFrame *ref, const Mv *mv, int x_off, int y_off,
-+                         int block_w, int block_h, int luma_weight, int luma_offset)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_LUMA_UNI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1502,7 +1653,7 @@ static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
-+                        AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
-+                        int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_LUMA_BI;
-+     cmd->dst = dst;
-+     cmd->dststride = dststride;
-+@@ -1524,7 +1675,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+                           ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
-+                           int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_CHROMA_UNI;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -1542,7 +1693,7 @@ static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
-+ static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds + s->num_mv_cmds++;
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
-+     cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
-+     cmd->dst = dst0;
-+     cmd->dststride = dststride;
-+@@ -2024,7 +2175,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2044,7 +2195,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+         {
-+@@ -2073,7 +2224,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2097,7 +2248,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2127,7 +2278,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2147,7 +2298,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+ 
-+@@ -2177,7 +2328,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2202,7 +2353,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2236,7 +2387,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int x2 = x0 + (mv2->x >> 2);
-+             int y2 = y0 + (mv2->y >> 2);
-+             int chan = x0>>6; // 64 wide blocks per QPU
-+-            uint32_t *y = s->y_mvs[chan % 12];
-++            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                   int bw = nPbW-start_x;
-+@@ -2252,7 +2403,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                 }
-+             }
-+-            s->y_mvs[chan % 12] = y;
-++            s->y_mvs[s->pass0_job][chan % 12] = y;
-+         } else
-+ #endif
-+         {
-+@@ -2285,7 +2436,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+ 
-+-                uint32_t *u = s->u_mvs[chan & 7];
-++                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2314,7 +2465,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[chan & 7] = u;
-++                s->u_mvs[s->pass0_job][chan & 7] = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2819,40 +2970,54 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ {
-+     int n;
-++    int job = s->pass2_job;
-+     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+-    int (*p)[2] = s->dblk_cmds;
-+-    for(n = s->num_dblk_cmds; n>0 ;n--,p++) {
-++    int (*p)[2] = s->dblk_cmds[job];
-++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-+         ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size);
-+     }
-+-    s->num_dblk_cmds = 0;
-++    s->num_dblk_cmds[job] = 0;
-+ }
-+ 
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-++#ifdef LAUNCH_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-+     //int j;
-+     //int16_t *coeffs = s->coeffs_buf_arm[i];
-+     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+     //    s->hevcdsp.idct[4-2](coeffs, 16);
-+     //}
-+ 
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated);
-+-    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0, &s->coeffs_buf_accelerated);
-++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-++    s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-++                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+     //vpu_wait(s->vpu_id);
-+ 
-+     for(i=0;i<4;i++)
-+-        s->num_coeffs[i] = 0;
-++        s->num_coeffs[job][i] = 0;
-+ }
-+ 
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+-  HEVCPredCmd *cmd = s->univ_pred_cmds;
-++  int job = s->pass2_job;
-++  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-++#ifdef RPI_WORKER
-++  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++#else
-+   HEVCLocalContext *lc = s->HEVClc;
-++#endif
-+ 
-+-  for(i = s->num_pred_cmds; i > 0; i--, cmd++) {
-++  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
-++      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
-+       if (cmd->type == RPI_PRED_INTRA) {
-+           lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
-+           lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
-+@@ -2871,21 +3036,26 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ #endif
-+       }
-+   }
-+-  s->num_pred_cmds = 0;
-++  s->num_pred_cmds[job] = 0;
-+ }
-+ 
-+ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ {
-+-    HEVCMvCmd *cmd = s->unif_mv_cmds;
-++#ifdef INTER_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-++    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+     int n,cidx;
-+     AVFrame myref;
-+     AVFrame myref1;
-+     struct MvField mymv;
-+-    if (s->num_mv_cmds > RPI_MAX_MV_CMDS) {
-++    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
-+         printf("Overflow inter_cmds\n");
-+         exit(-1);
-+     }
-+-    for(n = s->num_mv_cmds; n>0 ; n--, cmd++) {
-++    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
-+         switch(cmd->cmd) {
-+         case RPI_CMD_LUMA_UNI:
-+             myref.data[0] = cmd->src;
-+@@ -2925,7 +3095,28 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+             break;
-+         }
-+     }
-+-    s->num_mv_cmds = 0;
-++    s->num_mv_cmds[job] = 0;
-++}
-++
-++static void rpi_do_all_passes(HEVCContext *s)
-++{
-++#ifdef RPI_INTER_QPU
-++    // Kick off inter prediction on QPUs
-++    rpi_execute_inter_qpu(s);
-++#else
-++    rpi_execute_transform(s);
-++#endif
-++    // Perform luma inter prediction
-++    rpi_execute_inter_cmds(s);
-++    // Wait for transform completion
-++    vpu_wait(s->vpu_id);
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++#ifdef RPI_INTER_QPU
-++    rpi_inter_clear(s);
-++#endif
-+ }
-+ 
-+ #endif
-+@@ -2933,6 +3124,7 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ #ifdef RPI_INTER_QPU
-+ static void rpi_inter_clear(HEVCContext *s)
-+ {
-++    int job = s->pass0_job;
-+     int i;
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+@@ -2940,51 +3132,50 @@ static void rpi_inter_clear(HEVCContext *s)
-+                            (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+     for(i=0;i<8;i++) {
-+-        s->u_mvs[i] = s->mvs_base[i];
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = 0;
-+-        *s->u_mvs[i]++ = pic_width;
-+-        *s->u_mvs[i]++ = pic_height;
-+-        *s->u_mvs[i]++ = s->frame->linesize[1];
-+-        *s->u_mvs[i]++ = s->frame->linesize[2];
-++        s->u_mvs[job][i] = s->mvs_base[job][i];
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = 0;
-++        *s->u_mvs[job][i]++ = pic_width;
-++        *s->u_mvs[job][i]++ = pic_height;
-++        *s->u_mvs[job][i]++ = s->frame->linesize[1];
-++        *s->u_mvs[job][i]++ = s->frame->linesize[2];
-+         if (weight_flag) {
-+-            *s->u_mvs[i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-+-            *s->u_mvs[i]++ = s->sh.chroma_log2_weight_denom + 6;
-++            *s->u_mvs[job][i]++ = 1 << (s->sh.chroma_log2_weight_denom + 6 - 1);
-++            *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
-+         } else {
-+-            *s->u_mvs[i]++ = 1 << 5;
-+-            *s->u_mvs[i]++ = 6;
-++            *s->u_mvs[job][i]++ = 1 << 5;
-++            *s->u_mvs[job][i]++ = 6;
-+         }
-+-        *s->u_mvs[i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-++        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+     }
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(i=0;i<12;i++) {
-+-        s->y_mvs[i] = s->y_mvs_base[i];
-+-        *s->y_mvs[i]++ = 0; // y_x
-+-        *s->y_mvs[i]++ = 0; // ref_y_base
-+-        *s->y_mvs[i]++ = 0; // y2_x2
-+-        *s->y_mvs[i]++ = 0; // ref_y2_base
-+-        *s->y_mvs[i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-+-        *s->y_mvs[i]++ = s->frame->linesize[0]; // pitch
-+-        *s->y_mvs[i]++ = s->frame->linesize[0]; // dst_pitch
-++        s->y_mvs[job][i] = s->y_mvs_base[job][i];
-++        *s->y_mvs[job][i]++ = 0; // y_x
-++        *s->y_mvs[job][i]++ = 0; // ref_y_base
-++        *s->y_mvs[job][i]++ = 0; // y2_x2
-++        *s->y_mvs[job][i]++ = 0; // ref_y2_base
-++        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
-++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
-++        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
-+         if (weight_flag) {
-+             int offset = 1 << (s->sh.luma_log2_weight_denom + 6 - 1);
-+             int shift = s->sh.luma_log2_weight_denom + 6;
-+-            *s->y_mvs[i]++ = (offset << 16) + shift;
-++            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-+         } else {
-+             int offset = 1 << 5;
-+             int shift = 6;
-+-            *s->y_mvs[i]++ = (offset << 16) + shift;
-++            *s->y_mvs[job][i]++ = (offset << 16) + shift;
-+         }
-+-        *s->y_mvs[i]++ = 0; // Next kernel
-++        *s->y_mvs[job][i]++ = 0; // Next kernel
-+     }
-+ #endif
-+ }
-+ 
-+-
-+ #ifdef RPI_SIMULATE_QPUS
-+ 
-+ static int32_t clipx(int x,int FRAME_WIDTH)
-+@@ -3258,10 +3449,15 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ {
-+     int k;
-++#ifdef LAUNCH_PASS0
-++    int job = s->pass0_job;
-++#else
-++    int job = s->pass1_job;
-++#endif
-+     int i;
-+-    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-++    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+ #ifdef RPI_LUMA_QPU
-+-    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr.vc;
-++    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
-+ #endif
-+     if (s->sh.slice_type == I_SLICE) {
-+ #ifdef RPI_MULTI_MAILBOX
-+@@ -3270,22 +3466,22 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #endif
-+     }
-+     for(k=0;k<8;k++) {
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-+-        av_assert0(s->u_mvs[k] - s->mvs_base[k] < UV_COMMANDS_PER_QPU);
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
-++        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
-+     }
-+ 
-+-    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(k=0;k<12;k++) {
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-+-        s->y_mvs[k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+-        av_assert0(s->y_mvs[k] - s->y_mvs_base[k] < Y_COMMANDS_PER_QPU);
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
-++        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-++        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
-+     }
-+-    s->y_mvs[12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-++    s->y_mvs[job][12-1][-RPI_LUMA_COMMAND_WORDS+8] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+ #endif
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+@@ -3295,34 +3491,34 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    gpu_cache_flush3(&s->coeffs_buf_accelerated,&s->y_unif_mvs_ptr, &s->unif_mvs_ptr);
-++    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+ #else
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated);
-++    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+ #endif
-+-    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0,
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+ #ifdef RPI_LUMA_QPU
-+                                    qpu_get_fn(QPU_MC_SETUP),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[0 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[1 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[2 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[3 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[4 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[5 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[6 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[7 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[8 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[9 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[10 ] - (uint32_t*)s->y_unif_mvs_ptr.arm)),
-+-                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[11 ] - (uint32_t*)s->y_unif_mvs_ptr.arm))
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
-++                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
-+ #else
-+                                    0,
-+                                    0,0,0,0,
-+@@ -3331,17 +3527,17 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #endif
-+                                  );
-+     for(i=0;i<4;i++)
-+-        s->num_coeffs[i] = 0;
-++        s->num_coeffs[job][i] = 0;
-+ #else
-+     qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+-      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+-      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-++      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-++      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
-+       );
-+ #endif
-+ 
-+@@ -3398,6 +3594,11 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         }
-+     }
-+ 
-++#ifdef RPI_WORKER
-++    s->pass0_job = 0;
-++    s->pass1_job = 0;
-++    s->pass2_job = 0;
-++#endif
-+ #ifdef RPI_INTER_QPU
-+     rpi_inter_clear(s);
-+ #endif
-+@@ -3418,46 +3619,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-++
-+ #ifdef RPI
-+         if (s->enable_rpi) {
-+-          s->dblk_cmds[s->num_dblk_cmds][0] = x_ctb;
-+-          s->dblk_cmds[s->num_dblk_cmds++][1] = y_ctb;
-++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-++          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-+           if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-+-            // Transform all blocks
-+-            // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-#ifdef RPI_MULTI_MAILBOX
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-            // Perform luma inter prediction
-+-            rpi_execute_inter_cmds(s);
-+-#else
-+-            rpi_execute_transform(s);
-+-            // Perform inter prediction
-+-            rpi_execute_inter_cmds(s);
-+-#ifdef RPI_INTER_QPU
-+-            // Kick off inter prediction on QPUs
-+-            rpi_execute_inter_qpu(s);
-+-#endif
-+-#endif
-+-
-+-            // Wait for transform completion
-+-            vpu_wait(s->vpu_id);
-+-
-+-            // Copy back reconstructed data
-+-            //memcpy(s->frame->data[0],s->dummy.arm,2048*64);
-+-            //memcpy(s->frame->data[1],s->dummy.arm,1024*32);
-+-            //memcpy(s->frame->data[2],s->dummy.arm,1024*32);
-++#ifdef RPI_WORKER
-++            if (s->used_for_ref) {
-++              // Split work load onto separate threads so we make as rapid progress as possible with this frame
-++  #ifdef INTER_PASS0
-++              rpi_execute_inter_cmds(s);
-++  #endif
-++  #ifdef LAUNCH_PASS0
-++              rpi_execute_inter_qpu(s);
-++  #endif
-++              // Pass on this job to worker thread
-++              worker_submit_job(s);
-++              // Make sure we have space to prepare the next job
-++              worker_pass0_ready(s);
-+ 
-+-            // Perform intra prediction and residual reconstruction
-+-            rpi_execute_pred_cmds(s);
-+-            // Perform deblocking for CTBs in this row
-+-            rpi_execute_dblk_cmds(s);
-++              // Prepare the next batch of commands
-+ #ifdef RPI_INTER_QPU
-+-            rpi_inter_clear(s);
-++              rpi_inter_clear(s);
-++#endif
-++            } else {
-++              // Non-ref frame so do it all on this thread
-++              rpi_do_all_passes(s);
-++            }
-++#else
-++            rpi_do_all_passes(s);
-+ #endif
-+           }
-+         }
-+ #endif
-++
-++
-+         if (more_data < 0) {
-+             s->tab_slice_address[ctb_addr_rs] = -1;
-+             return more_data;
-+@@ -3474,18 +3671,21 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     }
-+ 
-+ #ifdef RPI
-+-    if (s->enable_rpi && s->num_dblk_cmds) {
-+-#ifdef RPI_INTER_QPU
-+-        rpi_execute_inter_qpu(s);
-+-#endif
-+-#ifndef RPI_MULTI_MAILBOX
-+-        rpi_execute_transform(s);
-++
-++#ifdef RPI_WORKER
-++    // Wait for the worker to finish all its jobs
-++    if (s->enable_rpi) {
-++        worker_wait(s);
-++        av_assert0(s->pass0_job==s->pass1_job);
-++        av_assert0(s->pass1_job==s->pass2_job);
-++    }
-+ #endif
-+-        rpi_execute_inter_cmds(s);
-+-        vpu_wait(s->vpu_id);
-+-        rpi_execute_pred_cmds(s);
-+-        rpi_execute_dblk_cmds(s);
-++
-++    // Finish off any half-completed rows
-++    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-++        rpi_do_all_passes(s);
-+     }
-++
-+ #endif
-+ 
-+     if (x_ctb + ctb_size >= s->ps.sps->width &&
-+@@ -4153,6 +4353,48 @@ fail:
-+     return AVERROR(ENOMEM);
-+ }
-+ 
-++#ifdef RPI_WORKER
-++static av_cold void hevc_init_worker(HEVCContext *s)
-++{
-++    int err;
-++    pthread_cond_init(&s->worker_cond_head, NULL);
-++    pthread_cond_init(&s->worker_cond_middle, NULL);
-++    pthread_cond_init(&s->worker_cond_tail, NULL);
-++    pthread_mutex_init(&s->worker_mutex, NULL);
-++
-++    s->worker_tail=0;
-++    s->worker_middle=0;
-++    s->worker_head=0;
-++    s->kill_worker=0;
-++    err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-++    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-++    if (err) {
-++        printf("Failed to create worker thread\n");
-++        exit(-1);
-++    }
-++}
-++
-++static av_cold void hevc_exit_worker(HEVCContext *s)
-++{
-++    void *res;
-++    s->kill_worker=1;
-++    pthread_cond_broadcast(&s->worker_cond_tail);
-++    pthread_cond_broadcast(&s->worker_cond_middle);
-++    pthread_join(s->worker_thread, &res);
-++    pthread_join(s->worker_deblock_thread, &res);
-++
-++    pthread_cond_destroy(&s->worker_cond_head);
-++    pthread_cond_destroy(&s->worker_cond_middle);
-++    pthread_cond_destroy(&s->worker_cond_tail);
-++    pthread_mutex_destroy(&s->worker_mutex);
-++
-++    s->worker_tail=0;
-++    s->worker_middle=0;
-++    s->worker_head=0;
-++    s->kill_worker=0;
-++}
-++#endif
-++
-+ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+ {
-+     HEVCContext       *s = avctx->priv_data;
-+@@ -4165,33 +4407,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+     av_freep(&s->cabac_state);
-+ 
-+ #ifdef RPI
-+-    av_freep(&s->unif_mv_cmds);
-+-    av_freep(&s->univ_pred_cmds);
-++
-++#ifdef RPI_WORKER
-++    hevc_exit_worker(s);
-++#endif
-++
-++    for(i=0;i<RPI_MAX_JOBS;i++) {
-++      av_freep(&s->unif_mv_cmds[i]);
-++      av_freep(&s->univ_pred_cmds[i]);
-+ 
-+ #ifdef RPI_INTER_QPU
-+-    if (s->unif_mvs) {
-+-        gpu_free( &s->unif_mvs_ptr );
-+-        s->unif_mvs = 0;
-+-    }
-++      if (s->unif_mvs[i]) {
-++        gpu_free( &s->unif_mvs_ptr[i] );
-++        s->unif_mvs[i] = 0;
-++      }
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-    if (s->y_unif_mvs) {
-+-        gpu_free( &s->y_unif_mvs_ptr );
-+-        s->y_unif_mvs = 0;
-+-    }
-++      if (s->y_unif_mvs[i]) {
-++        gpu_free( &s->y_unif_mvs_ptr[i] );
-++        s->y_unif_mvs[i] = 0;
-++      }
-+ #endif
-+-
-+-#ifdef EARLY_MALLOC
-+-    printf("hevc_decode_free\n");
-+-    if (s->coeffs_buf_arm[0]) {
-+-      gpu_free(&s->coeffs_buf_default);
-+-      s->coeffs_buf_arm[0] = 0;
-+-    }
-+-    if (s->coeffs_buf_arm[2]) {
-+-      gpu_free(&s->coeffs_buf_accelerated);
-+-      s->coeffs_buf_arm[2] = 0;
-+     }
-+-#endif
-++
-+ #endif
-+ 
-+     for (i = 0; i < 3; i++) {
-+@@ -4256,6 +4494,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+ {
-+     HEVCContext *s = avctx->priv_data;
-+     int i;
-++    int job;
-+ 
-+     s->avctx = avctx;
-+ 
-+@@ -4266,12 +4505,14 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     s->sList[0] = s;
-+ 
-+ #ifdef RPI
-+-    s->unif_mv_cmds = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-+-    if (!s->unif_mv_cmds)
-+-        goto fail;
-+-    s->univ_pred_cmds = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-+-    if (!s->univ_pred_cmds)
-+-        goto fail;
-++    for(job=0;job<RPI_MAX_JOBS;job++) {
-++        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
-++        if (!s->unif_mv_cmds[job])
-++            goto fail;
-++        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
-++        if (!s->univ_pred_cmds[job])
-++            goto fail;
-++    }
-+ 
-+ #ifdef RPI_INTER_QPU
-+     // We divide the image into blocks 256 wide and 64 high
-+@@ -4282,18 +4523,20 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     {
-+         int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
-+         uint32_t *p;
-++		for(job=0;job<RPI_MAX_JOBS;job++) {
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-        gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+ #else
-+-        gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-++          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
-+ #endif
-+-        s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
-+ 
-+-        // Set up initial locations for uniform streams
-+-        p = s->unif_mvs;
-+-        for(i = 0; i < 8; i++) {
-+-            s->mvs_base[i] = p;
-++          // Set up initial locations for uniform streams
-++          p = s->unif_mvs[job];
-++          for(i = 0; i < 8; i++) {
-++            s->mvs_base[job][i] = p;
-+             p += uv_commands_per_qpu;
-++          }
-+         }
-+         s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
-+         s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
-+@@ -4302,61 +4545,35 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-+     }
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-++    for(job=0;job<RPI_MAX_JOBS;job++)
-+     {
-+         int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
-+         uint32_t *p;
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+ #else
-+-        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr );
-++        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
-+ #endif
-+-        s->y_unif_mvs = (uint32_t *) s->y_unif_mvs_ptr.arm; // TODO support this allocation in non EARLY_MALLOC
-++        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
-+ 
-+         // Set up initial locations for uniform streams
-+-        p = s->y_unif_mvs;
-++        p = s->y_unif_mvs[job];
-+         for(i = 0; i < 12; i++) {
-+-            s->y_mvs_base[i] = p;
-++            s->y_mvs_base[job][i] = p;
-+             p += y_commands_per_qpu;
-+         }
-+-        s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-+-        s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+-
-+     }
-++    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
-++    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
-+ #endif
-+     //gpu_malloc_uncached(2048*64,&s->dummy);
-+ 
-+-#ifdef EARLY_MALLOC
-+-    {
-+-        int coeffs_in_ctb = 64*64;
-+-        int coefs_per_row = (2048/64) * coeffs_in_ctb * 3;  // Allow space for chroma
-+-        s->coeffs_buf_arm[0] = 0;
-+-        s->coeffs_buf_arm[2] = 0;
-+-        printf("Allocated %d\n",coefs_per_row);
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default);
-+-        s->coeffs_buf_arm[0] = (int16_t*) s->coeffs_buf_default.arm;
-+-        if (!s->coeffs_buf_arm[0])
-+-            goto fail;
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated);
-+-        s->coeffs_buf_arm[2] = (int16_t*) s->coeffs_buf_accelerated.arm;
-+-        s->coeffs_buf_vc[2] = s->coeffs_buf_accelerated.vc;
-+-        if (!s->coeffs_buf_arm[2])
-+-            goto fail;
-+-        s->coeffs_buf_arm[3] = coefs_per_row + s->coeffs_buf_arm[2];
-+-        s->coeffs_buf_vc[3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[2];
-+-        printf("Done\n");
-+-#ifdef RPI_PRECLEAR
-+-        //memset(s->coeffs_buf_arm[0],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[0], coefs_per_row);
-+-        //memset(s->coeffs_buf_arm[2],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[2], coefs_per_row);
-+-        //memset(s->coeffs_buf_arm[3],0, sizeof(int16_t) * coefs_per_row);
-+-        memclear16(s->coeffs_buf_arm[3], coefs_per_row);
-+-#endif
-+-    }
-+-#endif
-+-
-+     s->enable_rpi = 0;
-+ 
-++#ifdef RPI_WORKER
-++    hevc_init_worker(s);
-++#endif
-++
-+ #endif
-+ 
-+     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index aa4d218..8d72344 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -50,6 +50,12 @@
-+     // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
-+     #define RPI_LUMA_QPU
-+   #endif
-++
-++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
-++  #define RPI_MAX_JOBS 2
-++  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-++  #define RPI_WORKER
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -832,6 +838,13 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-++#ifdef RPI_WORKER
-++typedef struct HEVCLocalContextIntra {
-++    TransformUnit tu;
-++    NeighbourAvailable na;
-++} HEVCLocalContextIntra;
-++#endif
-++
-+ #ifdef RPI
-+ 
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+@@ -900,7 +913,7 @@ typedef struct HEVCPredCmd {
-+ 
-+ typedef struct HEVCContext {
-+ #ifdef RPI
-+-    int dblk_cmds[RPI_MAX_DEBLOCK_CMDS][2];
-++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+ #endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+@@ -909,7 +922,9 @@ typedef struct HEVCContext {
-+ 
-+     HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
-+     HEVCLocalContext    *HEVClc;
-+-
-++#ifdef RPI_WORKER
-++    HEVCLocalContextIntra HEVClcIntra;
-++#endif
-+     uint8_t             threads_type;
-+     uint8_t             threads_number;
-+ 
-+@@ -920,43 +935,60 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI
-+     int enable_rpi;
-+-    HEVCMvCmd *unif_mv_cmds;
-+-    HEVCPredCmd *univ_pred_cmds;
-++    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
-++    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
-+     int buf_width;
-+-    GPU_MEM_PTR_T coeffs_buf_default;
-+-    GPU_MEM_PTR_T coeffs_buf_accelerated;
-+-    int16_t *coeffs_buf_arm[4];
-+-    unsigned int coeffs_buf_vc[4];
-+-    int num_coeffs[4];
-+-    int num_xfm_cmds;
-+-    int num_mv_cmds;
-+-    int num_pred_cmds;
-+-    int num_dblk_cmds;
-++    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
-++    GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS];
-++    int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4];
-++    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
-++    int num_coeffs[RPI_MAX_JOBS][4];
-++    int num_xfm_cmds[RPI_MAX_JOBS];
-++    int num_mv_cmds[RPI_MAX_JOBS];
-++    int num_pred_cmds[RPI_MAX_JOBS];
-++    int num_dblk_cmds[RPI_MAX_JOBS];
-+     int vpu_id;
-+     //GPU_MEM_PTR_T dummy;
-++    int pass0_job; // Pass0 does coefficient decode
-++    int pass1_job; // Pass1 does pixel processing
-++    int pass2_job; // Pass2 does reconstruction and deblocking
-+ #ifdef RPI_INTER_QPU
-+-    GPU_MEM_PTR_T unif_mvs_ptr;
-+-    uint32_t *unif_mvs; // Base of memory for motion vector commands
-++    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-++    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+ 
-+     // _base pointers are to the start of the row
-+-    uint32_t *mvs_base[8];
-++    uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+     // these pointers are to the next free space
-+-    uint32_t *u_mvs[8];
-++    uint32_t *u_mvs[RPI_MAX_JOBS][8];
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-+     uint32_t mc_filter_uv_b0;
-+     uint32_t mc_filter_uv_b;
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-    GPU_MEM_PTR_T y_unif_mvs_ptr;
-+-    uint32_t *y_unif_mvs; // Base of memory for motion vector commands
-+-    uint32_t *y_mvs_base[12];
-+-    uint32_t *y_mvs[12];
-++    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
-++    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-++    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-++    uint32_t *y_mvs[RPI_MAX_JOBS][12];
-+     // Function pointers
-+     uint32_t mc_filter;
-+     uint32_t mc_filter_b;
-+ #endif
-+ 
-++#ifdef RPI_WORKER
-++    pthread_t worker_thread;
-++    pthread_t worker_deblock_thread;
-++    pthread_cond_t worker_cond_head;
-++    pthread_cond_t worker_cond_tail;
-++    pthread_cond_t worker_cond_middle;
-++    pthread_mutex_t worker_mutex;
-++
-++    int worker_tail; // Contains the number of posted jobs
-++    int worker_head; // Contains the number of completed jobs
-++    int worker_middle; // Contains the number of completed jobs
-++    int kill_worker; // set to 1 to terminate the worker
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index ca76cb0..b9f773b 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1051,11 +1051,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+         if (use_vpu) {
-+-            coeffs = s->coeffs_buf_arm[log2_trafo_size - 2] + s->num_coeffs[log2_trafo_size - 2];
-+-            s->num_coeffs[log2_trafo_size - 2] += n;
-++            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+         } else {
-+-            coeffs = s->coeffs_buf_arm[0] + s->num_coeffs[0];
-+-            s->num_coeffs[0] += n;
-++            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-++            s->num_coeffs[s->pass0_job][0] += n;
-+         }
-+     }
-+     // We now do the memset after transform_add while we know the data is cached.
-+@@ -1508,7 +1508,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                 s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+             }
-+         } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+-            s->hevcdsp.idct_4x4_luma(coeffs);
-++           s->hevcdsp.idct_4x4_luma(coeffs);
-+         } else {
-+ #ifdef RPI
-+             if (!use_vpu) {
-+@@ -1553,7 +1553,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     }
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+-        HEVCPredCmd *cmd = s->univ_pred_cmds + s->num_pred_cmds++;
-++        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
-+         cmd->type = RPI_PRED_TRANSFORM_ADD;
-+         cmd->size = log2_trafo_size;
-+         cmd->buf = coeffs;
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 71c6d52..344e021 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -71,8 +71,11 @@ do {                                  \
-+                 AV_WN4P(&ptr[i], a);                                           \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+-
-++#ifdef RPI_WORKER
-++    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++#else
-+     HEVCLocalContext *lc = s->HEVClc;
-++#endif
-+     int i;
-+     int hshift = s->ps.sps->hshift[c_idx];
-+     int vshift = s->ps.sps->vshift[c_idx];
-+-- 
-+2.5.0
-+
-+
-+From e3604dee43bae2083ecea8b578da9878a7877f1f Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 13:43:48 +0100
-+Subject: [PATCH 54/68] Avoid lockup bug with RPI_WORKER enabled
-+
-+---
-+ libavcodec/hevc.c       | 22 +++++++++++-----------
-+ libavcodec/hevc_cabac.c |  1 -
-+ 2 files changed, 11 insertions(+), 12 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 594340a..323d5f9 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -131,11 +131,11 @@ static uint32_t get_vc_address(AVBufferRef *bref) {
-+ static void worker_submit_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_tail++; // This is the only place that can change tail so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_tail++;
-+   s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+   pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+@@ -143,11 +143,11 @@ static void worker_submit_job(HEVCContext *s)
-+ static void worker_complete_middle_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_middle++; // This is the only place that can change head so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_middle++;
-+   s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+@@ -155,11 +155,11 @@ static void worker_complete_middle_job(HEVCContext *s)
-+ static void worker_complete_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+-  //pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_head++; // This is the only place that can change head so we do not need the mutex
-++  pthread_mutex_lock(&s->worker_mutex);
-++  s->worker_head++;
-+   s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the tail has moved
-+-  //pthread_mutex_unlock(&s->worker_mutex);
-++  pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-++  pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+ }
-+ 
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index b9f773b..16e7ac3 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1497,7 +1497,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                 for (i = 0; i < 8; i++)
-+                     FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-+             }
-+-
-+             s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
-+ 
-+             if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+-- 
-+2.5.0
-+
-+
-+From 38c4114cd6d6335dde5a4ba86f0f6e1a1529ad6a Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 15:37:19 +0100
-+Subject: [PATCH 55/68] Added code to flush buffers at start of frame
-+
-+---
-+ libavcodec/hevc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
-+ 1 file changed, 72 insertions(+)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 323d5f9..aa72f97 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -41,6 +41,7 @@
-+ 
-+ #ifdef RPI
-+   #include "rpi_qpu.h"
-++  #include "rpi_user_vcsm.h"
-+   // Move Inter prediction into separate pass
-+   #define RPI_INTER
-+ 
-+@@ -3495,6 +3496,7 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ #else
-+     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+ #endif
-++
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+@@ -3545,6 +3547,71 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ }
-+ #endif
-+ 
-++#ifdef RPI
-++
-++static void flush_buffer(AVBufferRef *bref) {
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-++    gpu_cache_flush(p);
-++}
-++
-++static void flush_frame(HEVCContext *s,AVFrame *frame)
-++{
-++#if 1
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    int n = s->ps.sps->height;
-++    int curr_y = 0;
-++    int curr_uv = 0;
-++    int n_uv = n >> s->ps.sps->vshift[1];
-++    int sz,base;
-++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++    base = s->frame->linesize[1] * curr_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[2]);
-++    iocache.s[1].handle = p->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[0]);
-++    sz = s->frame->linesize[0] * (n-curr_y);
-++    base = s->frame->linesize[0] * curr_y;
-++    iocache.s[2].handle = p->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].size  = sz;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    flush_buffer(frame->buf[0]);
-++    flush_buffer(frame->buf[1]);
-++    flush_buffer(frame->buf[2]);
-++#endif
-++}
-++
-++static void flush_all(HEVCContext *s)
-++{
-++#if 0
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 4; // Flush all
-++    iocache.s[0].addr = p->arm;
-++    iocache.s[0].size  = 4096;
-++    vcsm_clean_invalid( &iocache );
-++#else
-++  int i,k;
-++  for(i=0;i<2;i++) {
-++    for (k = 0; k < s->sh.nb_refs[i]; k++) {
-++      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
-++    }
-++  }
-++  flush_frame(s,s->frame);
-++#endif
-++}
-++#endif
-++
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ {
-+     HEVCContext *s  = avctxt->priv_data;
-+@@ -3579,8 +3646,12 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         printf("Weighted B slice\n");
-+     }
-+ 
-++    // Now flush all reference frames and our destination frame to get everything ready for decode
-++    flush_all(s);
-+ #endif
-+ 
-++    //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-++
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+         av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+         return AVERROR_INVALIDDATA;
-+@@ -3651,6 +3722,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+             rpi_do_all_passes(s);
-+ #endif
-+           }
-++
-+         }
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From b279851bb85b1fe15355603dcd53c3f1b6f06724 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 3 Jun 2015 16:42:24 +0100
-+Subject: [PATCH 56/68] Reduce the amount that needs to be flushed
-+
-+---
-+ libavcodec/hevc.c | 35 +++++++++++------------------------
-+ 1 file changed, 11 insertions(+), 24 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index aa72f97..a2ba177 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3556,7 +3556,7 @@ static void flush_buffer(AVBufferRef *bref) {
-+ 
-+ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+-#if 1
-++#ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     int n = s->ps.sps->height;
-+     int curr_y = 0;
-+@@ -3590,26 +3590,6 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-+-static void flush_all(HEVCContext *s)
-+-{
-+-#if 0
-+-    struct vcsm_user_clean_invalid_s iocache = {};
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[0]);
-+-    iocache.s[0].handle = p->vcsm_handle;
-+-    iocache.s[0].cmd = 4; // Flush all
-+-    iocache.s[0].addr = p->arm;
-+-    iocache.s[0].size  = 4096;
-+-    vcsm_clean_invalid( &iocache );
-+-#else
-+-  int i,k;
-+-  for(i=0;i<2;i++) {
-+-    for (k = 0; k < s->sh.nb_refs[i]; k++) {
-+-      flush_frame(s,s->ref->refPicList[i].ref[k]->frame);
-+-    }
-+-  }
-+-  flush_frame(s,s->frame);
-+-#endif
-+-}
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -3645,9 +3625,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+     }
-+-
-+-    // Now flush all reference frames and our destination frame to get everything ready for decode
-+-    flush_all(s);
-+ #endif
-+ 
-+     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+@@ -4053,6 +4030,11 @@ static int hevc_frame_start(HEVCContext *s)
-+     if (!s->avctx->hwaccel)
-+         ff_thread_finish_setup(s->avctx);
-+ 
-++#ifdef RPI_INTER_QPU
-++    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
-++    flush_frame(s,s->frame);
-++#endif
-++
-+     return 0;
-+ 
-+ fail:
-+@@ -4254,6 +4236,11 @@ fail:
-+         ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
-++    } else if (s->ref) {
-++#ifdef RPI_INTER_QPU
-++      // When running single threaded we need to flush the whole frame
-++      flush_frame(s,s->frame);
-++#endif
-+     }
-+     return ret;
-+ }
-+-- 
-+2.5.0
-+
-+
-+From 7475c16d1b6b4ce94bb65f42bf3ae26969d4abf4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 4 Jun 2015 07:59:28 +0100
-+Subject: [PATCH 57/68] Corrected support for disabled rpi when using
-+ RPI_WORKER
-+
-+---
-+ libavcodec/hevc.h              | 18 ++++++++++--------
-+ libavcodec/hevcpred_template.c |  2 +-
-+ 2 files changed, 11 insertions(+), 9 deletions(-)
-+
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 8d72344..83b0e58 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -795,7 +795,17 @@ typedef struct HEVCPacket {
-+     int nals_allocated;
-+ } HEVCPacket;
-+ 
-++#ifdef RPI_WORKER
-++typedef struct HEVCLocalContextIntra {
-++    TransformUnit tu;
-++    NeighbourAvailable na;
-++} HEVCLocalContextIntra;
-++#endif
-++
-+ typedef struct HEVCLocalContext {
-++    TransformUnit tu;
-++    NeighbourAvailable na;  // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra
-++
-+     uint8_t cabac_state[HEVC_CONTEXTS];
-+ 
-+     uint8_t stat_coeff[4];
-+@@ -810,7 +820,6 @@ typedef struct HEVCLocalContext {
-+ 
-+     int qPy_pred;
-+ 
-+-    TransformUnit tu;
-+ 
-+     uint8_t ctb_left_flag;
-+     uint8_t ctb_up_flag;
-+@@ -827,7 +836,6 @@ typedef struct HEVCLocalContext {
-+     int ct_depth;
-+     CodingUnit cu;
-+     PredictionUnit pu;
-+-    NeighbourAvailable na;
-+ 
-+ #define BOUNDARY_LEFT_SLICE     (1 << 0)
-+ #define BOUNDARY_LEFT_TILE      (1 << 1)
-+@@ -838,12 +846,6 @@ typedef struct HEVCLocalContext {
-+     int boundary_flags;
-+ } HEVCLocalContext;
-+ 
-+-#ifdef RPI_WORKER
-+-typedef struct HEVCLocalContextIntra {
-+-    TransformUnit tu;
-+-    NeighbourAvailable na;
-+-} HEVCLocalContextIntra;
-+-#endif
-+ 
-+ #ifdef RPI
-+ 
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 344e021..325b60e 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -72,7 +72,7 @@ do {                                  \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+ #ifdef RPI_WORKER
-+-    HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-++    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-+ #else
-+     HEVCLocalContext *lc = s->HEVClc;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From 665b1e12a132f7ea798472d46200ad930abe2a82 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 4 Jun 2015 11:52:55 +0100
-+Subject: [PATCH 58/68] Draft support for tiles
-+
-+---
-+ libavcodec/hevc.c              | 140 +++++++++++++++++++++++------------------
-+ libavcodec/hevc.h              |  22 +++++--
-+ libavcodec/hevc_filter.c       |   2 +-
-+ libavcodec/hevcpred_template.c |   2 +-
-+ 4 files changed, 100 insertions(+), 66 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index a2ba177..f3f5fdb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -61,10 +61,10 @@
-+ 
-+   static void rpi_execute_dblk_cmds(HEVCContext *s);
-+   static void rpi_execute_transform(HEVCContext *s);
-+-  static void rpi_execute_inter_qpu(HEVCContext *s);
-++  static void rpi_launch_vpu_qpu(HEVCContext *s);
-+   static void rpi_execute_pred_cmds(HEVCContext *s);
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+-  static void rpi_inter_clear(HEVCContext *s);
-++  static void rpi_begin(HEVCContext *s);
-+ 
-+   // Define INTER_PASS0 to do inter prediction in first pass
-+   //#define INTER_PASS0
-+@@ -88,16 +88,18 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
-+ 
-+ #ifdef RPI_INTER_QPU
-+ 
-++// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
-++// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
-++// For each block of 64*64 the smallest block size is 8x4
-++// We also need an extra command for the setup information
-++
-+ #define RPI_CHROMA_COMMAND_WORDS 12
-+-#define UV_COMMANDS_PER_QPU ((1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS)
-++#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
-+ // The QPU code for UV blocks only works up to a block width of 8
-+ #define RPI_CHROMA_BLOCK_WIDTH 8
-+ 
-+-// Split image of 2048 into parts 64 wide
-+-// So some QPUs will have 3 blocks of 64 to do, and others 2 blocks for an image 2048 wide with 32 blocks across
-+-// For each block of 64*64 the smallest block size is 8x4
-+ #define RPI_LUMA_COMMAND_WORDS 9
-+-#define Y_COMMANDS_PER_QPU ((1+3*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-++#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
-+ 
-+ #define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
-+ 
-+@@ -214,7 +216,7 @@ static void *worker_start(void *arg)
-+     LOG_ENTER
-+     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+ #ifndef LAUNCH_PASS0
-+-    rpi_execute_inter_qpu(s);
-++    rpi_launch_vpu_qpu(s);
-+ #endif
-+ #ifndef INTER_PASS0
-+     // Perform inter prediction
-+@@ -320,9 +322,14 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+ 
-+ #ifdef RPI
-+     av_assert0(sps);
-+-    int coeffs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+-    int coefs_per_row = sps->ctb_width * coeffs_in_ctb * 3;  // Allow space for chroma
-++    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-++    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-++    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-++    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-+     int job;
-++    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-++    s->ctu_per_y_chan = s->max_ctu_count / 12;
-++    s->ctu_per_uv_chan = s->max_ctu_count / 8;
-+     for(job=0;job<RPI_MAX_JOBS;job++) {
-+       printf("Allocated %d\n",coefs_per_row);
-+       for(job=0;job<RPI_MAX_JOBS;job++) {
-+@@ -2173,10 +2180,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+             int x1 = x0 + (mv->x >> 2);
-+             int y1 = y0 + (mv->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2196,7 +2202,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+         {
-+@@ -2220,12 +2226,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+-                int chan = x0>>8;
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2249,7 +2253,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2276,10 +2280,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int my2_mx2_my_mx = (my_mx << 16) + my_mx;
-+             int x1 = x0 + (mv->x >> 2);
-+             int y1 = y0 + (mv->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+             int weight_flag = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                               (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=16) {
-+                   int bw = nPbW-start_x;
-+@@ -2299,7 +2302,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+ 
-+@@ -2324,12 +2327,10 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+ 
-+                 int x1_c = x0_c + (mv->x >> (2 + hshift));
-+                 int y1_c = y0_c + (mv->y >> (2 + hshift));
-+-                //int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+-                int chan = x0>>8;
-+                 int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+                                        (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2354,7 +2355,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -2387,8 +2388,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+             int y1 = y0 + (mv->y >> 2);
-+             int x2 = x0 + (mv2->x >> 2);
-+             int y2 = y0 + (mv2->y >> 2);
-+-            int chan = x0>>6; // 64 wide blocks per QPU
-+-            uint32_t *y = s->y_mvs[s->pass0_job][chan % 12];
-++            uint32_t *y = s->curr_y_mvs;
-+             for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
-+               for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
-+                   int bw = nPbW-start_x;
-+@@ -2404,7 +2404,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                   y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
-+                 }
-+             }
-+-            s->y_mvs[s->pass0_job][chan % 12] = y;
-++            s->curr_y_mvs = y;
-+         } else
-+ #endif
-+         {
-+@@ -2435,9 +2435,8 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                 int x2_c = x0_c + (mv2->x >> (2 + hshift));
-+                 int y2_c = y0_c + (mv2->y >> (2 + hshift));
-+ 
-+-                int chan = x0>>8; // Allocate commands for the first 256 luma pixels across to the first QPU.  This is optimised for images around 1920 width
-+ 
-+-                uint32_t *u = s->u_mvs[s->pass0_job][chan & 7];
-++                uint32_t *u = s->curr_u_mvs;
-+                 for(int start_y=0;start_y < nPbH_c;start_y+=16) {
-+                   for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
-+                       int bw = nPbW_c-start_x;
-+@@ -2466,7 +2465,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
-+                       *u++ = (get_vc_address(s->frame->buf[2]) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
-+                     }
-+                 }
-+-                s->u_mvs[s->pass0_job][chan & 7] = u;
-++                s->curr_u_mvs = u;
-+                 return;
-+             }
-+ #endif
-+@@ -3101,12 +3100,8 @@ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ 
-+ static void rpi_do_all_passes(HEVCContext *s)
-+ {
-+-#ifdef RPI_INTER_QPU
-+-    // Kick off inter prediction on QPUs
-+-    rpi_execute_inter_qpu(s);
-+-#else
-+-    rpi_execute_transform(s);
-+-#endif
-++    // Kick off QPUs and VPUs
-++    rpi_launch_vpu_qpu(s);
-+     // Perform luma inter prediction
-+     rpi_execute_inter_cmds(s);
-+     // Wait for transform completion
-+@@ -3115,18 +3110,18 @@ static void rpi_do_all_passes(HEVCContext *s)
-+     rpi_execute_pred_cmds(s);
-+     // Perform deblocking for CTBs in this row
-+     rpi_execute_dblk_cmds(s);
-+-#ifdef RPI_INTER_QPU
-+-    rpi_inter_clear(s);
-+-#endif
-++    // Prepare next batch
-++    rpi_begin(s);
-+ }
-+ 
-+ #endif
-+ 
-+-#ifdef RPI_INTER_QPU
-+-static void rpi_inter_clear(HEVCContext *s)
-++#ifdef RPI
-++static void rpi_begin(HEVCContext *s)
-+ {
-+     int job = s->pass0_job;
-+     int i;
-++#ifdef RPI_INTER_QPU
-+     int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
-+     int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
-+     int weight_flag      = (s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) ||
-+@@ -3152,6 +3147,8 @@ static void rpi_inter_clear(HEVCContext *s)
-+         }
-+         *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
-+     }
-++    s->curr_u_mvs = s->u_mvs[job][0];
-++#endif
-+ 
-+ #ifdef RPI_LUMA_QPU
-+     for(i=0;i<12;i++) {
-+@@ -3174,8 +3171,11 @@ static void rpi_inter_clear(HEVCContext *s)
-+         }
-+         *s->y_mvs[job][i]++ = 0; // Next kernel
-+     }
-++    s->curr_y_mvs = s->y_mvs[job][0];
-+ #endif
-++    s->ctu_count = 0;
-+ }
-++#endif
-+ 
-+ #ifdef RPI_SIMULATE_QPUS
-+ 
-+@@ -3446,8 +3446,9 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ 
-+ #endif
-+ 
-++#ifdef RPI_INTER_QPU
-+ 
-+-static void rpi_execute_inter_qpu(HEVCContext *s)
-++static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ {
-+     int k;
-+ #ifdef LAUNCH_PASS0
-+@@ -3545,6 +3546,15 @@ static void rpi_execute_inter_qpu(HEVCContext *s)
-+ 
-+ 
-+ }
-++#else
-++
-++#ifdef RPI
-++static void rpi_launch_vpu_qpu(HEVCContext *s)
-++{
-++  rpi_execute_transform(s);
-++}
-++#endif
-++
-+ #endif
-+ 
-+ #ifdef RPI
-+@@ -3604,29 +3614,20 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI
-+ #ifdef RPI_INTER_QPU
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+-                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+                     && !s->ps.pps->cross_component_prediction_enabled_flag
-+-                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1
-+                     && !(s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE);
-+ #else
-+     s->enable_rpi = s->ps.sps->bit_depth == 8
-+-                    && s->ps.sps->width <= RPI_MAX_WIDTH
-+-                    && !s->ps.pps->cross_component_prediction_enabled_flag
-+-                    && s->ps.pps->num_tile_rows <= 1 && s->ps.pps->num_tile_columns <= 1;
-++                    && !s->ps.pps->cross_component_prediction_enabled_flag;
-+ #endif
-+ 
-+     if (!s->enable_rpi) {
-+       if (s->ps.pps->cross_component_prediction_enabled_flag)
-+         printf("Cross component\n");
-+-      if (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)
-+-        printf("Tiles\n");
-+-      if (s->ps.pps->weighted_pred_flag && s->sh.slice_type == P_SLICE)
-+-        printf("Weighted P slice\n");
-+       if (s->ps.pps->weighted_bipred_flag && s->sh.slice_type == B_SLICE)
-+         printf("Weighted B slice\n");
-+     }
-+ #endif
-+-
-+     //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]);
-+ 
-+     if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
-+@@ -3647,8 +3648,8 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     s->pass1_job = 0;
-+     s->pass2_job = 0;
-+ #endif
-+-#ifdef RPI_INTER_QPU
-+-    rpi_inter_clear(s);
-++#ifdef RPI
-++    rpi_begin(s);
-+ #endif
-+ 
-+     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
-+@@ -3666,13 +3667,34 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-++#ifdef RPI_INTER_QPU
-++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
-++#endif
-++#ifdef RPI_LUMA_QPU
-++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
-++#endif
-++
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ 
-++#ifdef RPI_INTER_QPU
-++        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-++#endif
-++#ifdef RPI_LUMA_QPU
-++        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-++#endif
-++
-+ #ifdef RPI
-+         if (s->enable_rpi) {
-++          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
-++          //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
-++          //av_assert0(s->pass0_job<RPI_MAX_JOBS);
-++          //av_assert0(s->pass0_job>=0);
-+           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
-+           s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
-+-          if ( (((y_ctb + ctb_size)&63) == 0) && x_ctb + ctb_size >= s->ps.sps->width) {
-++          s->ctu_count++;
-++          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
-++
-++          if ( s->ctu_count >= s->max_ctu_count ) {
-+ #ifdef RPI_WORKER
-+             if (s->used_for_ref) {
-+               // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+@@ -3680,7 +3702,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+               rpi_execute_inter_cmds(s);
-+   #endif
-+   #ifdef LAUNCH_PASS0
-+-              rpi_execute_inter_qpu(s);
-++              rpi_launch_vpu_qpu(s);
-+   #endif
-+               // Pass on this job to worker thread
-+               worker_submit_job(s);
-+@@ -3688,9 +3710,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+               worker_pass0_ready(s);
-+ 
-+               // Prepare the next batch of commands
-+-#ifdef RPI_INTER_QPU
-+-              rpi_inter_clear(s);
-+-#endif
-++              rpi_begin(s);
-+             } else {
-+               // Non-ref frame so do it all on this thread
-+               rpi_do_all_passes(s);
-+@@ -3731,7 +3751,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #endif
-+ 
-+     // Finish off any half-completed rows
-+-    if (s->enable_rpi && s->num_dblk_cmds[s->pass0_job]) {
-++    if (s->enable_rpi && s->ctu_count) {
-+         rpi_do_all_passes(s);
-+     }
-+ 
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 83b0e58..c62540d 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -849,8 +849,15 @@ typedef struct HEVCLocalContext {
-+ 
-+ #ifdef RPI
-+ 
-++// The processing is done in chunks
-++// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma)
-++// This is a distance of 1536 pixels across the screen
-++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
-++// but allocate more memory and increase the latency before data in the next frame can be processed
-++#define RPI_NUM_CHUNKS 1
-++
-+ // RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+-#define RPI_MAX_WIDTH 2048
-++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
-+ 
-+ // Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
-+ #define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
-+@@ -914,9 +921,6 @@ typedef struct HEVCPredCmd {
-+ #endif
-+ 
-+ typedef struct HEVCContext {
-+-#ifdef RPI
-+-    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-+-#endif
-+     const AVClass *c;  // needed by private avoptions
-+     AVCodecContext *avctx;
-+ 
-+@@ -954,6 +958,10 @@ typedef struct HEVCContext {
-+     int pass0_job; // Pass0 does coefficient decode
-+     int pass1_job; // Pass1 does pixel processing
-+     int pass2_job; // Pass2 does reconstruction and deblocking
-++    int ctu_count; // Number of CTUs done in pass0 so far
-++    int max_ctu_count; // Number of CTUs when we trigger a round of processing
-++    int ctu_per_y_chan; // Number of CTUs per luma QPU
-++    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
-+ #ifdef RPI_INTER_QPU
-+     GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
-+     uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+@@ -962,6 +970,7 @@ typedef struct HEVCContext {
-+     uint32_t *mvs_base[RPI_MAX_JOBS][8];
-+     // these pointers are to the next free space
-+     uint32_t *u_mvs[RPI_MAX_JOBS][8];
-++    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
-+     // Function pointers
-+     uint32_t mc_filter_uv;
-+     uint32_t mc_filter_uv_b0;
-+@@ -972,6 +981,7 @@ typedef struct HEVCContext {
-+     uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
-+     uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
-+     uint32_t *y_mvs[RPI_MAX_JOBS][12];
-++    uint32_t *curr_y_mvs; // Current uniform stream for luma
-+     // Function pointers
-+     uint32_t mc_filter;
-+     uint32_t mc_filter_b;
-+@@ -1099,6 +1109,10 @@ typedef struct HEVCContext {
-+     int sei_hflip, sei_vflip;
-+ 
-+     int picture_struct;
-++
-++#ifdef RPI
-++    int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2];
-++#endif
-+ } HEVCContext;
-+ 
-+ int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index b286bbf..1f04790 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -891,7 +891,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-+         if (curr_uv < 0) curr_uv = 0;
-+-        if (n_uv<=curr_uv) { assert(0); return; } // Should not happen
-++        if (n_uv<=curr_uv) { return; }
-+         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+         base = s->frame->linesize[1] * curr_uv;
-+         GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-+diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-+index 325b60e..28d2653 100644
-+--- a/libavcodec/hevcpred_template.c
-++++ b/libavcodec/hevcpred_template.c
-+@@ -72,7 +72,7 @@ do {                                  \
-+             else                                                               \
-+                 a = PIXEL_SPLAT_X4(ptr[i + 3])
-+ #ifdef RPI_WORKER
-+-    HEVCLocalContextIntra *lc = s->enable_rpi ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-++    HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
-+ #else
-+     HEVCLocalContext *lc = s->HEVClc;
-+ #endif
-+-- 
-+2.5.0
-+
-+
-+From e8bf19f86fefd76f1f48d7b96bb47ec23c2802fc Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 4 Jun 2015 15:48:10 +0100
-+Subject: [PATCH 59/68] Move deblocker into second pass
-+
-+---
-+ libavcodec/hevc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++----------
-+ 1 file changed, 65 insertions(+), 14 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index f3f5fdb..bd59f02 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -65,6 +65,8 @@
-+   static void rpi_execute_pred_cmds(HEVCContext *s);
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+   static void rpi_begin(HEVCContext *s);
-++  static void flush_frame(HEVCContext *s,AVFrame *frame);
-++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+   // Define INTER_PASS0 to do inter prediction in first pass
-+   //#define INTER_PASS0
-+@@ -225,6 +227,11 @@ static void *worker_start(void *arg)
-+     // Wait for transform completion
-+     vpu_wait(s->vpu_id);
-+ 
-++    // Perform intra prediction and residual reconstruction
-++    rpi_execute_pred_cmds(s);
-++    // Perform deblocking for CTBs in this row
-++    rpi_execute_dblk_cmds(s);
-++
-+     worker_complete_middle_job(s);
-+     LOG_EXIT
-+   }
-+@@ -246,10 +253,6 @@ static void *worker_deblock_start(void *arg)
-+       break;
-+     }
-+     LOG_ENTER
-+-    // Perform intra prediction and residual reconstruction
-+-    rpi_execute_pred_cmds(s);
-+-    // Perform deblocking for CTBs in this row
-+-    rpi_execute_dblk_cmds(s);
-+ 
-+     worker_complete_job(s);
-+     LOG_EXIT
-+@@ -2970,7 +2973,7 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
-+ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ {
-+     int n;
-+-    int job = s->pass2_job;
-++    int job = s->pass1_job;
-+     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-+     int (*p)[2] = s->dblk_cmds[job];
-+     for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) {
-+@@ -3008,7 +3011,7 @@ static void rpi_execute_transform(HEVCContext *s)
-+ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ {
-+   int i;
-+-  int job = s->pass2_job;
-++  int job = s->pass1_job;
-+   HEVCPredCmd *cmd = s->univ_pred_cmds[job];
-+ #ifdef RPI_WORKER
-+   HEVCLocalContextIntra *lc = &s->HEVClcIntra;
-+@@ -3493,11 +3496,10 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    gpu_cache_flush3(&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-+ #else
-+-    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
-+ #endif
-+-
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+@@ -3600,6 +3602,60 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++    struct vcsm_user_clean_invalid_s iocache = {};
-++    int n = s->ps.sps->height;
-++    int curr_y = 0;
-++    int curr_uv = 0;
-++    int n_uv = n >> s->ps.sps->vshift[1];
-++    int sz,base;
-++    sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++    base = s->frame->linesize[1] * curr_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-++    iocache.s[0].handle = p->vcsm_handle;
-++    iocache.s[0].cmd = 3; // clean+invalidate
-++    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[2]);
-++    iocache.s[1].handle = p->vcsm_handle;
-++    iocache.s[1].cmd = 3; // clean+invalidate
-++    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].size  = sz;
-++    p = av_buffer_pool_opaque(frame->buf[0]);
-++    sz = s->frame->linesize[0] * (n-curr_y);
-++    base = s->frame->linesize[0] * curr_y;
-++    iocache.s[2].handle = p->vcsm_handle;
-++    iocache.s[2].cmd = 3; // clean+invalidate
-++    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].size  = sz;
-++
-++    iocache.s[3].handle = p0->vcsm_handle;
-++    iocache.s[3].cmd = 3; // clean+invalidate
-++    iocache.s[3].addr = (int) p0->arm;
-++    iocache.s[3].size  = p0->numbytes;
-++    if (p1) {
-++      iocache.s[4].handle = p1->vcsm_handle;
-++      iocache.s[4].cmd = 3; // clean+invalidate
-++      iocache.s[4].addr = (int) p1->arm;
-++      iocache.s[4].size  = p1->numbytes;
-++    }
-++    if (p2) {
-++      iocache.s[5].handle = p2->vcsm_handle;
-++      iocache.s[5].cmd = 3; // clean+invalidate
-++      iocache.s[5].addr = (int) p2->arm;
-++      iocache.s[5].size  = p2->numbytes;
-++    }
-++    vcsm_clean_invalid( &iocache );
-++#else
-++    flush_buffer(frame->buf[0]);
-++    flush_buffer(frame->buf[1]);
-++    flush_buffer(frame->buf[2]);
-++    gpu_cache_flush3(p0, p1, p2);
-++#endif
-++}
-++
-+ #endif
-+ 
-+ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+@@ -4050,11 +4106,6 @@ static int hevc_frame_start(HEVCContext *s)
-+     if (!s->avctx->hwaccel)
-+         ff_thread_finish_setup(s->avctx);
-+ 
-+-#ifdef RPI_INTER_QPU
-+-    // Invalidate the output data buffer so it is ready for the QPUs to write into it.
-+-    flush_frame(s,s->frame);
-+-#endif
-+-
-+     return 0;
-+ 
-+ fail:
-+-- 
-+2.5.0
-+
-+
-+From bd42b24c8f7e1f0d2bcfa476d2e1aea20aa3723e Mon Sep 17 00:00:00 2001
-+From: popcornmix <popcornmix@gmail.com>
-+Date: Thu, 4 Jun 2015 16:10:23 +0100
-+Subject: [PATCH 60/68] Change order of ctu accesses to improve qpu performance
-+
-+---
-+ libavcodec/hevc.c | 8 ++++----
-+ 1 file changed, 4 insertions(+), 4 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index bd59f02..ff93f6c 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -3724,19 +3724,19 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+         s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+ 
-+ #ifdef RPI_INTER_QPU
-+-        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan];
-++        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan];
-++        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
-+ #endif
-+ 
-+         more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
-+ 
-+ #ifdef RPI_INTER_QPU
-+-        s->u_mvs[s->pass0_job][s->ctu_count / s->ctu_per_uv_chan] = s->curr_u_mvs;
-++        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
-+ #endif
-+ #ifdef RPI_LUMA_QPU
-+-        s->y_mvs[s->pass0_job][s->ctu_count / s->ctu_per_y_chan] = s->curr_y_mvs;
-++        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
-+ #endif
-+ 
-+ #ifdef RPI
-+-- 
-+2.5.0
-+
-+
-+From 3ba78b5fe86fccfb132068603ad1db87ce44ab6c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 09:36:59 +0100
-+Subject: [PATCH 61/68] Removed deblocker thread
-+
-+---
-+ libavcodec/hevc.c | 77 +++----------------------------------------------------
-+ libavcodec/hevc.h |  4 ---
-+ 2 files changed, 4 insertions(+), 77 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index ff93f6c..43f7ce5 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -68,11 +68,6 @@
-+   static void flush_frame(HEVCContext *s,AVFrame *frame);
-+   static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-+ 
-+-  // Define INTER_PASS0 to do inter prediction in first pass
-+-  //#define INTER_PASS0
-+-  // Define LAUNCH_PASS0 to launch QPU/VPU from pass0
-+-  //#define LAUNCH_PASS0
-+-
-+ #endif
-+ 
-+ // #define DISABLE_MC
-+@@ -145,24 +140,12 @@ static void worker_submit_job(HEVCContext *s)
-+ }
-+ 
-+ // Call this to say we have completed pass1
-+-static void worker_complete_middle_job(HEVCContext *s)
-+-{
-+-  LOG_ENTER
-+-  pthread_mutex_lock(&s->worker_mutex);
-+-  s->worker_middle++;
-+-  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+-  pthread_cond_broadcast(&s->worker_cond_middle); // Let people know that the middle has moved
-+-  pthread_mutex_unlock(&s->worker_mutex);
-+-  LOG_EXIT
-+-}
-+-
-+-// Call this to say we have completed pass2
-+ static void worker_complete_job(HEVCContext *s)
-+ {
-+   LOG_ENTER
-+   pthread_mutex_lock(&s->worker_mutex);
-+   s->worker_head++;
-+-  s->pass2_job = (s->pass2_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-++  s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot
-+   pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved
-+   pthread_mutex_unlock(&s->worker_mutex);
-+   LOG_EXIT
-+@@ -206,7 +189,7 @@ static void *worker_start(void *arg)
-+   while(1) {
-+     pthread_mutex_lock(&s->worker_mutex);
-+ 
-+-    while( !s->kill_worker && s->worker_tail - s->worker_middle <= 0)
-++    while( !s->kill_worker && s->worker_tail - s->worker_head <= 0)
-+     {
-+       pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex);
-+     }
-+@@ -217,13 +200,9 @@ static void *worker_start(void *arg)
-+     }
-+     LOG_ENTER
-+     // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
-+-#ifndef LAUNCH_PASS0
-+     rpi_launch_vpu_qpu(s);
-+-#endif
-+-#ifndef INTER_PASS0
-+     // Perform inter prediction
-+     rpi_execute_inter_cmds(s);
-+-#endif
-+     // Wait for transform completion
-+     vpu_wait(s->vpu_id);
-+ 
-+@@ -232,28 +211,6 @@ static void *worker_start(void *arg)
-+     // Perform deblocking for CTBs in this row
-+     rpi_execute_dblk_cmds(s);
-+ 
-+-    worker_complete_middle_job(s);
-+-    LOG_EXIT
-+-  }
-+-  return NULL;
-+-}
-+-
-+-static void *worker_deblock_start(void *arg)
-+-{
-+-  HEVCContext *s = (HEVCContext *)arg;
-+-  while(1) {
-+-    pthread_mutex_lock(&s->worker_mutex);
-+-    while( !s->kill_worker && s->worker_middle - s->worker_head <= 0)
-+-    {
-+-      pthread_cond_wait(&s->worker_cond_middle, &s->worker_mutex);
-+-    }
-+-    pthread_mutex_unlock(&s->worker_mutex);
-+-
-+-    if (s->kill_worker) {
-+-      break;
-+-    }
-+-    LOG_ENTER
-+-
-+     worker_complete_job(s);
-+     LOG_EXIT
-+   }
-+@@ -2985,11 +2942,7 @@ static void rpi_execute_dblk_cmds(HEVCContext *s)
-+ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+-#ifdef LAUNCH_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     //int j;
-+     //int16_t *coeffs = s->coeffs_buf_arm[i];
-+     //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+@@ -3044,11 +2997,7 @@ static void rpi_execute_pred_cmds(HEVCContext *s)
-+ 
-+ static void rpi_execute_inter_cmds(HEVCContext *s)
-+ {
-+-#ifdef INTER_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     HEVCMvCmd *cmd = s->unif_mv_cmds[job];
-+     int n,cidx;
-+     AVFrame myref;
-+@@ -3454,11 +3403,7 @@ static void rpi_simulate_inter_qpu(HEVCContext *s)
-+ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ {
-+     int k;
-+-#ifdef LAUNCH_PASS0
-+-    int job = s->pass0_job;
-+-#else
-+     int job = s->pass1_job;
-+-#endif
-+     int i;
-+     uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
-+ #ifdef RPI_LUMA_QPU
-+@@ -3561,10 +3506,12 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI
-+ 
-++#ifndef RPI_FAST_CACHEFLUSH
-+ static void flush_buffer(AVBufferRef *bref) {
-+     GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+     gpu_cache_flush(p);
-+ }
-++#endif
-+ 
-+ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+@@ -3702,7 +3649,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_WORKER
-+     s->pass0_job = 0;
-+     s->pass1_job = 0;
-+-    s->pass2_job = 0;
-+ #endif
-+ #ifdef RPI
-+     rpi_begin(s);
-+@@ -3754,12 +3700,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+ #ifdef RPI_WORKER
-+             if (s->used_for_ref) {
-+               // Split work load onto separate threads so we make as rapid progress as possible with this frame
-+-  #ifdef INTER_PASS0
-+-              rpi_execute_inter_cmds(s);
-+-  #endif
-+-  #ifdef LAUNCH_PASS0
-+-              rpi_launch_vpu_qpu(s);
-+-  #endif
-+               // Pass on this job to worker thread
-+               worker_submit_job(s);
-+               // Make sure we have space to prepare the next job
-+@@ -3801,8 +3741,6 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+     // Wait for the worker to finish all its jobs
-+     if (s->enable_rpi) {
-+         worker_wait(s);
-+-        av_assert0(s->pass0_job==s->pass1_job);
-+-        av_assert0(s->pass1_job==s->pass2_job);
-+     }
-+ #endif
-+ 
-+@@ -4488,16 +4426,13 @@ static av_cold void hevc_init_worker(HEVCContext *s)
-+ {
-+     int err;
-+     pthread_cond_init(&s->worker_cond_head, NULL);
-+-    pthread_cond_init(&s->worker_cond_middle, NULL);
-+     pthread_cond_init(&s->worker_cond_tail, NULL);
-+     pthread_mutex_init(&s->worker_mutex, NULL);
-+ 
-+     s->worker_tail=0;
-+-    s->worker_middle=0;
-+     s->worker_head=0;
-+     s->kill_worker=0;
-+     err = pthread_create(&s->worker_thread, NULL, worker_start, s);
-+-    err = pthread_create(&s->worker_deblock_thread, NULL, worker_deblock_start, s);
-+     if (err) {
-+         printf("Failed to create worker thread\n");
-+         exit(-1);
-+@@ -4509,17 +4444,13 @@ static av_cold void hevc_exit_worker(HEVCContext *s)
-+     void *res;
-+     s->kill_worker=1;
-+     pthread_cond_broadcast(&s->worker_cond_tail);
-+-    pthread_cond_broadcast(&s->worker_cond_middle);
-+     pthread_join(s->worker_thread, &res);
-+-    pthread_join(s->worker_deblock_thread, &res);
-+ 
-+     pthread_cond_destroy(&s->worker_cond_head);
-+-    pthread_cond_destroy(&s->worker_cond_middle);
-+     pthread_cond_destroy(&s->worker_cond_tail);
-+     pthread_mutex_destroy(&s->worker_mutex);
-+ 
-+     s->worker_tail=0;
-+-    s->worker_middle=0;
-+     s->worker_head=0;
-+     s->kill_worker=0;
-+ }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index c62540d..6c0d0b6 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -957,7 +957,6 @@ typedef struct HEVCContext {
-+     //GPU_MEM_PTR_T dummy;
-+     int pass0_job; // Pass0 does coefficient decode
-+     int pass1_job; // Pass1 does pixel processing
-+-    int pass2_job; // Pass2 does reconstruction and deblocking
-+     int ctu_count; // Number of CTUs done in pass0 so far
-+     int max_ctu_count; // Number of CTUs when we trigger a round of processing
-+     int ctu_per_y_chan; // Number of CTUs per luma QPU
-+@@ -989,15 +988,12 @@ typedef struct HEVCContext {
-+ 
-+ #ifdef RPI_WORKER
-+     pthread_t worker_thread;
-+-    pthread_t worker_deblock_thread;
-+     pthread_cond_t worker_cond_head;
-+     pthread_cond_t worker_cond_tail;
-+-    pthread_cond_t worker_cond_middle;
-+     pthread_mutex_t worker_mutex;
-+ 
-+     int worker_tail; // Contains the number of posted jobs
-+     int worker_head; // Contains the number of completed jobs
-+-    int worker_middle; // Contains the number of completed jobs
-+     int kill_worker; // set to 1 to terminate the worker
-+ #endif
-+ 
-+-- 
-+2.5.0
-+
-+
-+From d0720e2a6f21bbdf2ad1d52227ae272db4cf9dc0 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 11:04:43 +0100
-+Subject: [PATCH 62/68] Reduced amount of output frame that is invalidated
-+
-+---
-+ libavcodec/hevc.c | 45 +++++++++++++++++++++++++++++----------------
-+ 1 file changed, 29 insertions(+), 16 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 43f7ce5..ef61788 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -66,7 +66,7 @@
-+   static void rpi_execute_inter_cmds(HEVCContext *s);
-+   static void rpi_begin(HEVCContext *s);
-+   static void flush_frame(HEVCContext *s,AVFrame *frame);
-+-  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
-++  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
-+ 
-+ #endif
-+ 
-+@@ -3441,9 +3441,9 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ 
-+ #ifdef RPI_MULTI_MAILBOX
-+ #ifdef RPI_CACHE_UNIF_MVS
-+-    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job]);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
-+ #else
-+-    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL);
-++    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+ #endif
-+     s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+@@ -3517,6 +3517,7 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     int n = s->ps.sps->height;
-+     int curr_y = 0;
-+     int curr_uv = 0;
-+@@ -3524,22 +3525,21 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+     int sz,base;
-+     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+     base = s->frame->linesize[1] * curr_uv;
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].addr = (int)(p->arm) + base;
-+     iocache.s[0].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[2]);
-+     iocache.s[1].handle = p->vcsm_handle;
-+     iocache.s[1].cmd = 3; // clean+invalidate
-+-    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].addr = (int)(p->arm) + base;
-+     iocache.s[1].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[0]);
-+     sz = s->frame->linesize[0] * (n-curr_y);
-+     base = s->frame->linesize[0] * curr_y;
-+     iocache.s[2].handle = p->vcsm_handle;
-+     iocache.s[2].cmd = 3; // clean+invalidate
-+-    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].addr = (int)(p->arm) + base;
-+     iocache.s[2].size  = sz;
-+     vcsm_clean_invalid( &iocache );
-+ #else
-+@@ -3549,33 +3549,46 @@ static void flush_frame(HEVCContext *s,AVFrame *frame)
-+ #endif
-+ }
-+ 
-+-static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-++static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
-+ {
-+ #ifdef RPI_FAST_CACHEFLUSH
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+-    int n = s->ps.sps->height;
-+-    int curr_y = 0;
-+-    int curr_uv = 0;
-+-    int n_uv = n >> s->ps.sps->vshift[1];
-++    int n;
-++    int curr_y;
-++    int curr_uv;
-++    int n_uv;
-++    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     int sz,base;
-++    int (*d)[2] = s->dblk_cmds[job];
-++    int low=(*d)[1];
-++    int high=(*d)[1];
-++    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
-++        int y = (*d)[1];
-++        low=FFMIN(low,y);
-++        high=FFMAX(high,y);
-++    }
-++    curr_y = low;
-++    n = high+(1 << s->ps.sps->log2_ctb_size);
-++    curr_uv = curr_y >> s->ps.sps->vshift[1];
-++    n_uv = n >> s->ps.sps->vshift[1];
-++
-+     sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+     base = s->frame->linesize[1] * curr_uv;
-+-    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm + base;
-++    iocache.s[0].addr = (int)(p->arm) + base;
-+     iocache.s[0].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[2]);
-+     iocache.s[1].handle = p->vcsm_handle;
-+     iocache.s[1].cmd = 3; // clean+invalidate
-+-    iocache.s[1].addr = p->arm + base;
-++    iocache.s[1].addr = (int)(p->arm) + base;
-+     iocache.s[1].size  = sz;
-+     p = av_buffer_pool_opaque(frame->buf[0]);
-+     sz = s->frame->linesize[0] * (n-curr_y);
-+     base = s->frame->linesize[0] * curr_y;
-+     iocache.s[2].handle = p->vcsm_handle;
-+     iocache.s[2].cmd = 3; // clean+invalidate
-+-    iocache.s[2].addr = p->arm + base;
-++    iocache.s[2].addr = (int)(p->arm) + base;
-+     iocache.s[2].size  = sz;
-+ 
-+     iocache.s[3].handle = p0->vcsm_handle;
-+-- 
-+2.5.0
-+
-+
-+From 980ce082dd1c0101e2aec64121c9de1d03a287f4 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Mon, 8 Jun 2015 11:55:29 +0100
-+Subject: [PATCH 63/68] Packed 16x16 and 32x32 into the same buffer
-+
-+---
-+ libavcodec/hevc.c       | 24 +++++++++++++++---------
-+ libavcodec/hevc_cabac.c |  9 ++++++++-
-+ libavcodec/rpi_qpu.c    |  2 +-
-+ 3 files changed, 24 insertions(+), 11 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index ef61788..8c6db35 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -297,12 +297,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+         s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
-+         if (!s->coeffs_buf_arm[job][0])
-+             goto fail;
-+-        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row * 2, &s->coeffs_buf_accelerated[job]);
-++        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
-+         s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
-+         s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
-+         if (!s->coeffs_buf_arm[job][2])
-+             goto fail;
-+-        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];
-++        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
-+         s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
-+       }
-+     }
-+@@ -2943,15 +2943,20 @@ static void rpi_execute_transform(HEVCContext *s)
-+ {
-+     int i=2;
-+     int job = s->pass1_job;
-+-    //int j;
-+-    //int16_t *coeffs = s->coeffs_buf_arm[i];
-+-    //for(j=s->num_coeffs[i]; j > 0; j-= 16*16, coeffs+=16*16) {
-+-    //    s->hevcdsp.idct[4-2](coeffs, 16);
-+-    //}
-++    /*int j;
-++    int16_t *coeffs = s->coeffs_buf_arm[job][i];
-++    for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) {
-++        s->hevcdsp.idct[4-2](coeffs, 16);
-++    }
-++    i=3;
-++    coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i];
-++    for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) {
-++        s->hevcdsp.idct[5-2](coeffs, 32);
-++    }*/
-+ 
-+     gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
-+     s->vpu_id = vpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
-+-                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3],
-++                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
-+                                s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
-+     //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0);
-+     //gpu_cache_flush(&s->coeffs_buf_accelerated);
-+@@ -3445,7 +3450,8 @@ static void rpi_launch_vpu_qpu(HEVCContext *s)
-+ #else
-+     flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
-+ #endif
-+-    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3], s->num_coeffs[job][3] >> 10, 0,
-++    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
-++                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
-+                                    qpu_get_fn(QPU_MC_SETUP_UV),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+                                    (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 16e7ac3..271e17a 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -1051,7 +1051,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+         if (use_vpu) {
-+-            coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            // We support size 4 and size 5.
-++            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
-++            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
-++            // num_coeffs is indexed by log2_trafo_size-2
-++            if (log2_trafo_size == 4)
-++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
-++            else
-++                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
-+             s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
-+         } else {
-+             coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 4480f72..0121fca 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -5,7 +5,7 @@
-+ // define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
-+ //#define RPI_TIME_TOTAL_VPU
-+ // define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
-+-//#define RPI_TIME_TOTAL_POSTED
-++#define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ 
-+-- 
-+2.5.0
-+
-+
-+From dd561eb52a075c09da89bf20f8d18fb92123ec2c Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Thu, 25 Jun 2015 09:02:47 +0100
-+Subject: [PATCH 64/68] Moved luma deblock to VPU
-+
-+---
-+ libavcodec/hevc.c               |   18 +-
-+ libavcodec/hevc.h               |   11 +
-+ libavcodec/hevc_filter.c        |  120 ++-
-+ libavcodec/rpi_hevc_transform.h | 1802 ++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s |  426 +++++++++
-+ libavcodec/rpi_qpu.c            |   12 +-
-+ libavcodec/rpi_shader.c         |    2 +-
-+ 7 files changed, 2378 insertions(+), 13 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index 8c6db35..da4bebb 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -244,6 +244,12 @@ static void pic_arrays_free(HEVCContext *s)
-+       }
-+     }
-+ #endif
-++#ifdef RPI_DEBLOCK_VPU
-++    if (s->y_setup_arm) {
-++      gpu_free(&s->y_setup_ptr);
-++      s->y_setup_arm = 0;
-++    }
-++#endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+ 
-+@@ -281,12 +287,12 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
-+ 
-+ #ifdef RPI
-+-    av_assert0(sps);
-+     int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
-+     int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
-+     int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
-+     int coefs_per_row = coefs_per_luma + coefs_per_chroma;
-+     int job;
-++    av_assert0(sps);
-+     s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
-+     s->ctu_per_y_chan = s->max_ctu_count / 12;
-+     s->ctu_per_uv_chan = s->max_ctu_count / 8;
-+@@ -307,6 +313,16 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+       }
-+     }
-+ #endif
-++#ifdef RPI_DEBLOCK_VPU
-++    s->enable_rpi_deblock = !sps->sao_enabled;
-++    s->setup_width = (sps->width+15) / 16;
-++    s->setup_height = (sps->height+15) / 16;
-++    gpu_malloc_uncached(sizeof(*s->y_setup_arm) * s->setup_width * s->setup_height, &s->y_setup_ptr); // TODO make this cached
-++    s->y_setup_arm = (void*)s->y_setup_ptr.arm;
-++    s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-++    memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-++    printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-++#endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+     s->bs_height = (height >> 2) + 1;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 6c0d0b6..c933757 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -56,6 +56,8 @@
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+ 
-++  #define RPI_DEBLOCK_VPU
-++
-+ #endif
-+ 
-+ #define MAX_DPB_SIZE 16 // A.4.1
-+@@ -997,6 +999,15 @@ typedef struct HEVCContext {
-+     int kill_worker; // set to 1 to terminate the worker
-+ #endif
-+ 
-++#ifdef RPI_DEBLOCK_VPU
-++    int enable_rpi_deblock;
-++    GPU_MEM_PTR_T y_setup_ptr;
-++    uint8_t (*y_setup_arm)[2][2][2][4];
-++    uint8_t (*y_setup_vc)[2][2][2][4];
-++    int setup_width; // Number of 16x16 blocks across the image
-++    int setup_height; // Number of 16x16 blocks down the image
-++#endif
-++
-+ #endif
-+ 
-+     uint8_t *cabac_state;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 1f04790..06371da 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -564,6 +564,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                          s->frame->linesize[LUMA],
-+                                                          beta, tc, no_p, no_q);
-+                 } else
-++#ifdef RPI_DEBLOCK_VPU
-++                if (s->enable_rpi_deblock) {
-++                    uint8_t (*setup)[2][2][4];
-++                    int num16 = (y>>4)*s->setup_width + (x>>4);
-++                    int a = ((y>>3) & 1) << 1;
-++                    int b = (x>>3) & 1;
-++                    setup = s->y_setup_arm[num16];
-++                    setup[0][b][0][a] = beta;
-++                    setup[0][b][0][a + 1] = beta;
-++                    setup[0][b][1][a] = tc[0];
-++                    setup[0][b][1][a + 1] = tc[1];
-++                } else
-++#endif
-+                     s->hevcdsp.hevc_v_loop_filter_luma(src,
-+                                                        s->frame->linesize[LUMA],
-+                                                        beta, tc, no_p, no_q);
-+@@ -596,6 +609,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                          s->frame->linesize[LUMA],
-+                                                          beta, tc, no_p, no_q);
-+                 } else
-++#ifdef RPI_DEBLOCK_VPU
-++                if (s->enable_rpi_deblock) {
-++                    uint8_t (*setup)[2][2][4];
-++                    int num16 = (y>>4)*s->setup_width + (x>>4);
-++                    int a = ((x>>3) & 1) << 1;
-++                    int b = (y>>3) & 1;
-++                    setup = s->y_setup_arm[num16];
-++                    setup[1][b][0][a] = beta;
-++                    setup[1][b][0][a + 1] = beta;
-++                    setup[1][b][1][a] = tc[0];
-++                    setup[1][b][1][a + 1] = tc[1];
-++                } else
-++#endif
-+                     s->hevcdsp.hevc_h_loop_filter_luma(src,
-+                                                        s->frame->linesize[LUMA],
-+                                                        beta, tc, no_p, no_q);
-+@@ -876,33 +902,85 @@ static void flush_buffer(AVBufferRef *bref) {
-+ }
-+ 
-+ // Return Physical address for this image
-+-static int ff_hevc_buf_base(AVBufferRef *bref) {
-++static uint32_t get_vc_address(AVBufferRef *bref) {
-+   GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
-+-  return p->vc & 0x3fffffff;
-++  return p->vc;
-+ }
-+ 
-++// ff_hevc_flush_buffer_lines
-++// flushes and invalidates all pixel rows in [start,end-1]
-++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
-++{
-++#ifdef RPI_FAST_CACHEFLUSH
-++        struct vcsm_user_clean_invalid_s iocache = {};
-++        int curr_y = start;
-++        int n = end;
-++        int curr_uv = curr_y >> s->ps.sps->vshift[1];
-++        int n_uv = n >> s->ps.sps->vshift[1];
-++        int sz,base;
-++        GPU_MEM_PTR_T *p;
-++        if (curr_uv < 0) curr_uv = 0;
-++        if (n_uv<=curr_uv) { return; }
-++        sz = s->frame->linesize[1] * (n_uv-curr_uv);
-++        base = s->frame->linesize[1] * curr_uv;
-++        if (flush_chroma) {
-++          p = av_buffer_pool_opaque(s->frame->buf[1]);
-++          iocache.s[0].handle = p->vcsm_handle;
-++          iocache.s[0].cmd = 3; // clean+invalidate
-++          iocache.s[0].addr = (int)p->arm + base;
-++          iocache.s[0].size  = sz;
-++          p = av_buffer_pool_opaque(s->frame->buf[2]);
-++          iocache.s[1].handle = p->vcsm_handle;
-++          iocache.s[1].cmd = 3; // clean+invalidate
-++          iocache.s[1].addr = (int)p->arm + base;
-++          iocache.s[1].size  = sz;
-++        }
-++        if (flush_luma) {
-++          p = av_buffer_pool_opaque(s->frame->buf[0]);
-++          sz = s->frame->linesize[0] * (n-curr_y);
-++          base = s->frame->linesize[0] * curr_y;
-++          iocache.s[2].handle = p->vcsm_handle;
-++          iocache.s[2].cmd = 3; // clean+invalidate
-++          iocache.s[2].addr = (int)p->arm + base;
-++          iocache.s[2].size  = sz;
-++        }
-++        vcsm_clean_invalid( &iocache );
-++#else
-++        if (flush_chroma) {
-++          flush_buffer(s->frame->buf[1]);
-++          flush_buffer(s->frame->buf[2]);
-++        }
-++        if (flush_luma) {
-++          flush_buffer(s->frame->buf[0]);
-++        }
-++#endif
-++}
-++
-++
-+ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ {
-+     if (s->enable_rpi && s->used_for_ref) {
-++      // TODO make this use ff_hevc_flush_buffer_lines
-+ #ifdef RPI_FAST_CACHEFLUSH
-+         struct vcsm_user_clean_invalid_s iocache = {};
-+         int curr_y = ((int *)f->progress->data)[0];
-+         int curr_uv = curr_y >> s->ps.sps->vshift[1];
-+         int n_uv = n >> s->ps.sps->vshift[1];
-+         int sz,base;
-++        GPU_MEM_PTR_T *p;
-+         if (curr_uv < 0) curr_uv = 0;
-+         if (n_uv<=curr_uv) { return; }
-+         sz = s->frame->linesize[1] * (n_uv-curr_uv);
-+         base = s->frame->linesize[1] * curr_uv;
-+-        GPU_MEM_PTR_T *p = av_buffer_pool_opaque(s->frame->buf[1]);
-++        p = av_buffer_pool_opaque(s->frame->buf[1]);
-+         iocache.s[0].handle = p->vcsm_handle;
-+         iocache.s[0].cmd = 3; // clean+invalidate
-+-        iocache.s[0].addr = p->arm + base;
-++        iocache.s[0].addr = (int)p->arm + base;
-+         iocache.s[0].size  = sz;
-+         p = av_buffer_pool_opaque(s->frame->buf[2]);
-+         iocache.s[1].handle = p->vcsm_handle;
-+         iocache.s[1].cmd = 3; // clean+invalidate
-+-        iocache.s[1].addr = p->arm + base;
-++        iocache.s[1].addr = (int)p->arm + base;
-+         iocache.s[1].size  = sz;
-+ 
-+ #ifdef RPI_LUMA_QPU
-+@@ -911,7 +989,7 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+         base = s->frame->linesize[0] * curr_y;
-+         iocache.s[2].handle = p->vcsm_handle;
-+         iocache.s[2].cmd = 3; // clean+invalidate
-+-        iocache.s[2].addr = p->arm + base;
-++        iocache.s[2].addr = (int)p->arm + base;
-+         iocache.s[2].size  = sz;
-+ #endif
-+         vcsm_clean_invalid( &iocache );
-+@@ -930,11 +1008,40 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ }
-+ #endif
-+ 
-++#ifdef RPI_DEBLOCK_VPU
-++/* rpi_deblock deblocks an entire row of ctbs using the VPU */
-++static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
-++{
-++  // Flush image, 4 lines above to bottom of ctb stripe
-++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
-++  // TODO flush buffer of beta/tc setup when it becomes cached
-++  // Call VPU
-++  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
-++  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
-++                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
-++                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
-++}
-++
-++static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
-++{
-++   int y2;
-++   for(y2=y;y2<y+ctb_size;y2+=16) {
-++      rpi_deblock(s,y2,16);
-++   }
-++}
-++#endif
-++
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-+     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
-+         deblocking_filter_CTB(s, x, y);
-++#ifdef RPI_DEBLOCK_VPU
-++    if (s->enable_rpi_deblock && x_end)
-++    {
-++      rpi_deblock(s, y, ctb_size);
-++    }
-++#endif
-+     if (s->ps.sps->sao_enabled) {
-+         int y_end = y >= s->ps.sps->height - ctb_size;
-+         if (y && x)
-+@@ -965,6 +1072,7 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //if (((y + ctb_size)&63)==0)
-+ #ifdef RPI_INTER_QPU
-+         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-++        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-+     }
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index 4f13622..b3f155f 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,7 +3,13 @@ unsigned char rpi_hevc_transform [] = {
-+ 106,
-+ 0,
-+ 144,
-+-35,
-++38,
-++1,
-++37,
-++106,
-++0,
-++144,
-++57,
-+ 1,
-+ 169,
-+ 3,
-+@@ -627,4 +633,1798 @@ unsigned char rpi_hevc_transform [] = {
-+ 30,
-+ 90,
-+ 0,
-++169,
-++3,
-++73,
-++64,
-++52,
-++64,
-++45,
-++64,
-++2,
-++64,
-++10,
-++64,
-++64,
-++198,
-++1,
-++7,
-++8,
-++232,
-++63,
-++0,
-++0,
-++0,
-++6,
-++232,
-++253,
-++255,
-++255,
-++255,
-++0,
-++246,
-++0,
-++0,
-++0,
-++4,
-++215,
-++64,
-++3,
-++96,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++137,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++129,
-++0,
-++131,
-++102,
-++0,
-++158,
-++67,
-++0,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++108,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++100,
-++0,
-++131,
-++102,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++161,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++150,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++3,
-++99,
-++131,
-++71,
-++68,
-++232,
-++32,
-++0,
-++0,
-++0,
-++0,
-++99,
-++2,
-++99,
-++23,
-++102,
-++7,
-++106,
-++127,
-++156,
-++182,
-++255,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++112,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++101,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++25,
-++102,
-++9,
-++106,
-++2,
-++30,
-++41,
-++3,
-++26,
-++87,
-++162,
-++64,
-++64,
-++198,
-++1,
-++23,
-++127,
-++158,
-++103,
-++255,
-++239,
-++3,
-++0,
-++254,
-++0,
-++143,
-++92,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++143,
-++93,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++143,
-++94,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++95,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++208,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++209,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++142,
-++210,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++0,
-++142,
-++211,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++107,
-++0,
-++8,
-++255,
-++99,
-++23,
-++0,
-++212,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++23,
-++0,
-++228,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++227,
-++23,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++52,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++99,
-++52,
-++0,
-++164,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++52,
-++0,
-++148,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++239,
-++3,
-++0,
-++254,
-++0,
-++143,
-++12,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++143,
-++13,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++143,
-++14,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++15,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++16,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++17,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++64,
-++142,
-++18,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++0,
-++142,
-++19,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++33,
-++0,
-++8,
-++255,
-++99,
-++3,
-++0,
-++212,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++3,
-++0,
-++228,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++227,
-++3,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++4,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++99,
-++4,
-++0,
-++164,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++163,
-++4,
-++0,
-++148,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++32,
-++246,
-++192,
-++11,
-++1,
-++16,
-++32,
-++246,
-++2,
-++137,
-++47,
-++240,
-++40,
-++246,
-++2,
-++140,
-++47,
-++240,
-++128,
-++245,
-++99,
-++140,
-++5,
-++4,
-++0,
-++247,
-++99,
-++140,
-++1,
-++20,
-++88,
-++246,
-++99,
-++140,
-++1,
-++20,
-++0,
-++247,
-++35,
-++136,
-++62,
-++226,
-++32,
-++247,
-++35,
-++136,
-++32,
-++210,
-++0,
-++247,
-++34,
-++136,
-++63,
-++2,
-++208,
-++246,
-++34,
-++136,
-++0,
-++4,
-++0,
-++247,
-++99,
-++136,
-++58,
-++162,
-++32,
-++247,
-++99,
-++136,
-++33,
-++146,
-++0,
-++247,
-++98,
-++136,
-++59,
-++18,
-++208,
-++246,
-++98,
-++136,
-++0,
-++20,
-++0,
-++247,
-++162,
-++136,
-++33,
-++2,
-++88,
-++246,
-++98,
-++137,
-++2,
-++68,
-++88,
-++246,
-++162,
-++137,
-++3,
-++68,
-++208,
-++254,
-++227,
-++136,
-++60,
-++242,
-++192,
-++243,
-++188,
-++11,
-++208,
-++254,
-++227,
-++136,
-++56,
-++178,
-++192,
-++243,
-++188,
-++10,
-++32,
-++255,
-++226,
-++136,
-++38,
-++58,
-++192,
-++243,
-++60,
-++0,
-++208,
-++254,
-++227,
-++136,
-++59,
-++242,
-++192,
-++243,
-++60,
-++128,
-++32,
-++255,
-++226,
-++136,
-++49,
-++58,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++226,
-++136,
-++34,
-++34,
-++192,
-++243,
-++60,
-++128,
-++32,
-++255,
-++226,
-++136,
-++37,
-++58,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++192,
-++136,
-++1,
-++4,
-++0,
-++240,
-++0,
-++160,
-++0,
-++255,
-++194,
-++8,
-++0,
-++52,
-++195,
-++243,
-++0,
-++128,
-++0,
-++255,
-++202,
-++40,
-++0,
-++52,
-++195,
-++243,
-++0,
-++128,
-++0,
-++254,
-++0,
-++240,
-++35,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++192,
-++136,
-++1,
-++4,
-++0,
-++240,
-++0,
-++160,
-++0,
-++255,
-++226,
-++140,
-++34,
-++34,
-++195,
-++243,
-++60,
-++0,
-++32,
-++255,
-++227,
-++140,
-++36,
-++58,
-++192,
-++243,
-++60,
-++0,
-++0,
-++254,
-++192,
-++136,
-++0,
-++4,
-++0,
-++240,
-++0,
-++160,
-++16,
-++246,
-++226,
-++136,
-++35,
-++50,
-++16,
-++246,
-++226,
-++136,
-++35,
-++50,
-++32,
-++246,
-++226,
-++136,
-++35,
-++50,
-++32,
-++254,
-++226,
-++136,
-++35,
-++58,
-++192,
-++243,
-++60,
-++0,
-++11,
-++96,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++115,
-++5,
-++106,
-++0,
-++144,
-++173,
-++1,
-++27,
-++96,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++147,
-++5,
-++106,
-++0,
-++144,
-++227,
-++0,
-++64,
-++246,
-++163,
-++140,
-++1,
-++4,
-++0,
-++246,
-++192,
-++175,
-++63,
-++2,
-++0,
-++246,
-++192,
-++174,
-++59,
-++2,
-++0,
-++246,
-++128,
-++175,
-++62,
-++2,
-++0,
-++246,
-++128,
-++174,
-++58,
-++2,
-++0,
-++246,
-++64,
-++175,
-++61,
-++2,
-++0,
-++246,
-++64,
-++174,
-++57,
-++2,
-++0,
-++255,
-++43,
-++240,
-++4,
-++212,
-++192,
-++243,
-++128,
-++11,
-++64,
-++254,
-++43,
-++240,
-++1,
-++228,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++244,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++180,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++164,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++191,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++235,
-++143,
-++52,
-++242,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++2,
-++212,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++191,
-++226,
-++192,
-++243,
-++188,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++180,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++2,
-++68,
-++32,
-++247,
-++35,
-++141,
-++190,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++171,
-++143,
-++52,
-++226,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++180,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++191,
-++226,
-++192,
-++243,
-++188,
-++10,
-++128,
-++253,
-++43,
-++240,
-++3,
-++212,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++35,
-++141,
-++1,
-++196,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++189,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++107,
-++143,
-++52,
-++210,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++148,
-++192,
-++243,
-++128,
-++11,
-++64,
-++254,
-++43,
-++240,
-++1,
-++164,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++180,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++240,
-++1,
-++244,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++228,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++187,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++235,
-++142,
-++52,
-++178,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++2,
-++148,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++187,
-++162,
-++192,
-++243,
-++188,
-++10,
-++64,
-++254,
-++43,
-++141,
-++0,
-++244,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++2,
-++68,
-++32,
-++247,
-++35,
-++141,
-++186,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++171,
-++142,
-++52,
-++162,
-++192,
-++243,
-++60,
-++128,
-++0,
-++255,
-++43,
-++240,
-++4,
-++244,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++43,
-++240,
-++187,
-++162,
-++192,
-++243,
-++188,
-++10,
-++128,
-++253,
-++43,
-++240,
-++3,
-++148,
-++192,
-++243,
-++128,
-++10,
-++64,
-++254,
-++35,
-++141,
-++1,
-++132,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++32,
-++247,
-++35,
-++141,
-++185,
-++66,
-++240,
-++246,
-++35,
-++141,
-++50,
-++66,
-++0,
-++255,
-++107,
-++142,
-++52,
-++146,
-++192,
-++243,
-++60,
-++128,
-++64,
-++255,
-++98,
-++141,
-++0,
-++52,
-++192,
-++243,
-++0,
-++0,
-++0,
-++254,
-++0,
-++240,
-++53,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++0,
-++240,
-++1,
-++4,
-++0,
-++240,
-++64,
-++147,
-++5,
-++106,
-++0,
-++144,
-++177,
-++0,
-++88,
-++246,
-++163,
-++140,
-++1,
-++4,
-++128,
-++245,
-++99,
-++141,
-++10,
-++4,
-++88,
-++246,
-++162,
-++138,
-++1,
-++68,
-++0,
-++247,
-++162,
-++138,
-++36,
-++162,
-++88,
-++254,
-++162,
-++138,
-++3,
-++164,
-++192,
-++243,
-++128,
-++11,
-++0,
-++255,
-++226,
-++137,
-++32,
-++2,
-++195,
-++243,
-++60,
-++0,
-++32,
-++247,
-++226,
-++137,
-++42,
-++114,
-++0,
-++255,
-++34,
-++138,
-++33,
-++18,
-++195,
-++243,
-++60,
-++0,
-++32,
-++247,
-++34,
-++138,
-++42,
-++130,
-++16,
-++246,
-++98,
-++138,
-++40,
-++114,
-++16,
-++246,
-++98,
-++138,
-++41,
-++146,
-++32,
-++246,
-++98,
-++138,
-++41,
-++146,
-++32,
-++246,
-++226,
-++137,
-++41,
-++146,
-++40,
-++246,
-++34,
-++138,
-++41,
-++146,
-++32,
-++247,
-++163,
-++141,
-++63,
-++178,
-++32,
-++247,
-++227,
-++141,
-++62,
-++162,
-++0,
-++254,
-++0,
-++240,
-++8,
-++4,
-++0,
-++240,
-++128,
-++11,
-++128,
-++253,
-++35,
-++240,
-++9,
-++100,
-++192,
-++243,
-++128,
-++10,
-++128,
-++253,
-++163,
-++141,
-++128,
-++115,
-++192,
-++243,
-++152,
-++10,
-++88,
-++246,
-++163,
-++141,
-++4,
-++100,
-++208,
-++246,
-++35,
-++139,
-++0,
-++100,
-++32,
-++255,
-++34,
-++139,
-++53,
-++202,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++139,
-++0,
-++4,
-++0,
-++240,
-++0,
-++160,
-++240,
-++246,
-++163,
-++141,
-++48,
-++98,
-++0,
-++247,
-++99,
-++139,
-++63,
-++210,
-++0,
-++247,
-++98,
-++139,
-++1,
-++212,
-++88,
-++254,
-++98,
-++139,
-++1,
-++212,
-++192,
-++243,
-++128,
-++11,
-++32,
-++255,
-++99,
-++139,
-++62,
-++98,
-++192,
-++243,
-++188,
-++10,
-++88,
-++246,
-++98,
-++139,
-++1,
-++212,
-++240,
-++246,
-++98,
-++139,
-++50,
-++210,
-++0,
-++247,
-++163,
-++128,
-++59,
-++146,
-++0,
-++247,
-++160,
-++128,
-++1,
-++36,
-++88,
-++254,
-++160,
-++128,
-++1,
-++36,
-++192,
-++243,
-++128,
-++11,
-++0,
-++247,
-++163,
-++128,
-++58,
-++98,
-++64,
-++255,
-++35,
-++240,
-++0,
-++100,
-++192,
-++243,
-++128,
-++10,
-++64,
-++255,
-++163,
-++128,
-++0,
-++164,
-++192,
-++243,
-++128,
-++10,
-++88,
-++246,
-++160,
-++128,
-++1,
-++36,
-++240,
-++246,
-++160,
-++128,
-++50,
-++34,
-++8,
-++255,
-++227,
-++143,
-++54,
-++242,
-++192,
-++243,
-++60,
-++128,
-++40,
-++255,
-++227,
-++142,
-++54,
-++178,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++240,
-++39,
-++10,
-++0,
-++240,
-++60,
-++128,
-++8,
-++255,
-++163,
-++143,
-++45,
-++226,
-++192,
-++243,
-++60,
-++128,
-++0,
-++254,
-++0,
-++240,
-++44,
-++10,
-++0,
-++240,
-++60,
-++0,
-++0,
-++254,
-++0,
-++240,
-++40,
-++10,
-++0,
-++240,
-++60,
-++128,
-++8,
-++255,
-++163,
-++142,
-++2,
-++162,
-++192,
-++243,
-++60,
-++128,
-++90,
-++0,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index fd159bc..b055208 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -83,6 +83,8 @@
-+ hevc_trans_16x16:
-+   cmp r5,1
-+   beq memclear16
-++  cmp r5,2
-++  beq hevc_deblock_16x16
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -282,3 +284,427 @@ loop:
-+   cmp r1,0
-+   bgt loop
-+   b lr
-++
-++
-++################################################################################
-++# HEVC VPU Deblock
-++#
-++# Vertical edges before horizontal
-++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
-++#
-++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
-++# The VPU code works in units of 16x16 blocks.
-++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
-++# One final horizontal filter is required at the end.
-++# PCM is not allowed in this code.
-++#
-++#
-++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
-++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering.
-++
-++.set P0,63
-++.set P1,62
-++.set P2,61
-++.set P3,60
-++.set Q0,59
-++.set Q1,58
-++.set Q2,57
-++.set Q3,56
-++
-++.set dp,32
-++.set dq,33
-++.set d,34
-++.set decision,35
-++.set beta,36
-++.set beta2,37
-++.set beta3,38
-++.set ptest,39
-++.set qtest,40
-++.set pqtest,41
-++.set thresh,42
-++.set deltatest, 44
-++.set deltap1, 45
-++.set tc25, 46
-++.set setup,47
-++.set tc,48
-++.set tc25,49
-++.set tc2, 50
-++.set do_filter, 51
-++.set delta, 52
-++.set tc10, 53
-++.set delta0, 54
-++.set delta1, 55
-++.set zeros, 0
-++.set setup_input, 1
-++.set deltaq1, 2
-++
-++
-++
-++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
-++# Row has num16 16x16 blocks across
-++# Beta goes from 0 to 64
-++# tc goes from 0 to 24
-++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
-++#   has 8 bytes per edge
-++#   has 16 bytes per direction
-++#   has 32 bytes per 16x16 block
-++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
-++hevc_deblock_16x16:
-++  push r6-r15, lr
-++  mov r9,r4
-++  mov r4,r3
-++  mov r13,r2
-++  mov r2,r0
-++  mov r10,r0
-++  subscale4 r0,r1
-++  mov r8,63
-++  mov r6,-3
-++  vmov H(zeros,0),0
-++# r7 is number of blocks still to load
-++# r0 is location of current block - 4 * stride
-++# r1 is stride
-++# r2 is location of current block
-++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
-++# r4 is setup
-++# r5 is for temporary calculations
-++# r8 holds 63
-++# r6 holds -3
-++# r9 holds the number of 16 high rows to process
-++# r10 holds the original img base
-++# r11 returns 0 if no filtering was done on the edge
-++# r12 saves a copy of this
-++# r13 is copy of width
-++
-++process_row:
-++  # First iteration does not do horizontal filtering on previous
-++  mov r7, r13
-++  mov r3,0
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
-++  vstb H(zeros,0),(r4)
-++  bl vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
-++  bl vert_filter
-++  sub r3,8
-++  b start_deblock_loop
-++deblock_loop:
-++  # Middle iterations do vertical on current block and horizontal on preceding
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)
-++  vstb H(zeros,0),(r4)
-++  bl vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl vert_filter
-++  sub r3,8
-++  vldb H(setup_input,0), -16(r4)
-++  vstb H(zeros,0),-16(r4)
-++  bl horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl horz_filter
-++  sub r3,8*64
-++  addcmpbeq r12,0,0,skip_save_top
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++skip_save_top:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++start_deblock_loop:
-++  # move onto next 16x16 (could do this with circular buffer support instead)
-++  add r3,16
-++  and r3,r8
-++  add r4,32
-++  # Perform loop counter operations (may work with an addcmpbgt as well?)
-++  add r0,16
-++  add r2,16
-++  sub r7,1
-++  cmp r7,0 # Are there still more blocks to load
-++  bgt deblock_loop
-++
-++  # Final iteration needs to just do horizontal filtering
-++  vldb H(setup_input,0), -16(r4)
-++  vstb H(zeros,0),-16(r4)
-++  bl horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl horz_filter
-++  sub r3,64*8
-++  addcmpbeq r12,0,0,skip_save_top2
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++skip_save_top2:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++
-++# Now look to see if we should do another row
-++  sub r9,1
-++  cmp r9,0
-++  bgt start_again
-++  pop r6-r15, pc
-++start_again:
-++  # Need to sort out r0,r2 to point to next row down
-++  addscale16 r10,r1
-++  mov r2,r10
-++  subscale4 r0,r2,r1
-++  b process_row
-++
-++
-++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
-++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
-++
-++vert_filter:
-++  push lr
-++
-++  vmov HX(P3,0), V(16,12)+r3
-++  vmov HX(P2,0), V(16,13)+r3
-++  vmov HX(P1,0), V(16,14)+r3
-++  vmov HX(P0,0), V(16,15)+r3
-++  vmov HX(Q0,0), V(16,16)+r3
-++  vmov HX(Q1,0), V(16,17)+r3
-++  vmov HX(Q2,0), V(16,18)+r3
-++  vmov HX(Q3,0), V(16,19)+r3
-++
-++  bl do_luma_filter
-++
-++  vadds V(16,13)+r3, HX(P2,0), 0
-++  vadds V(16,14)+r3, HX(P1,0), 0
-++  vadds V(16,15)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds V(16,16)+r3, HX(Q0,0), 0
-++  vadds V(16,17)+r3, HX(Q1,0), 0
-++  vadds V(16,18)+r3, HX(Q2,0), 0
-++
-++  pop pc
-++
-++# Filter edge at H(16,0)+r3
-++horz_filter:
-++  push lr
-++
-++  vmov HX(P3,0), H(12,0)+r3
-++  vmov HX(P2,0), H(13,0)+r3
-++  vmov HX(P1,0), H(14,0)+r3
-++  vmov HX(P0,0), H(15,0)+r3
-++  vmov HX(Q0,0), H(16,0)+r3
-++  vmov HX(Q1,0), H(17,0)+r3
-++  vmov HX(Q2,0), H(18,0)+r3
-++  vmov HX(Q3,0), H(19,0)+r3
-++
-++  bl do_luma_filter
-++
-++  vadds H(13,0)+r3, HX(P2,0), 0
-++  vadds H(14,0)+r3, HX(P1,0), 0
-++  vadds H(15,0)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds H(16,0)+r3, HX(Q0,0), 0
-++  vadds H(17,0)+r3, HX(Q1,0), 0
-++  vadds H(18,0)+r3, HX(Q2,0), 0
-++
-++  pop pc
-++
-++# r4 points to array of beta/tc for each 4 length edge
-++do_luma_filter:
-++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
-++  valtl HX(beta,0),H(setup,0),H(setup,0)
-++  valtu HX(tc,0),H(setup,0),H(setup,0)
-++  vmul HX(tc25,0), HX(tc,0), 5
-++  vadd HX(tc25,0),HX(tc25,0), 1
-++  vasr HX(tc25,0), HX(tc25,0), 1
-++
-++  # Compute decision
-++  vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
-++  vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
-++  vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
-++  vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
-++
-++  vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
-++  vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
-++  vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
-++  vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
-++
-++  vadd HX(d,0), HX(dp,0), HX(dq,0)
-++  vasr HX(beta2,0),HX(beta,0),2
-++  vasr HX(beta3,0),HX(beta,0),3
-++
-++  # Compute flags that are negative if all conditions pass
-++  vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
-++  vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
-++  vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
-++
-++  vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
-++  vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
-++  vadd HX(decision,0), HX(d,0), HX(d,0) IFN
-++  vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
-++  vmov HX(decision,0), 1 IFNN
-++  vadd H(decision,0),H(decision,3),0 IFN
-++  vadd H(decision,16),H(decision,19),0 IFN
-++  vmov -,HX(decision,0) SETF   # N marks strong filter
-++  vmov HX(decision,0), 1 IFNN  # NN marks normal filter
-++
-++  vadd HX(do_filter,0), HX(d,3), HX(d,0)
-++  vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
-++  vmov HX(decision,0),0 IFNN # Z marks no filter
-++
-++  # Expand out decision (currently valid one every 4 pixels)  0...1...2...3
-++  # First extract out even terms
-++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0.1.2.3
-++  vodd HX(decision,0),HX(decision,0),HX(decision,0)  # 0123
-++  # Now expand back
-++  valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
-++  valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
-++
-++  # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
-++
-++  # Do a quick check to see if there is anything to do
-++  mov r11, 0 # Signal no filtering
-++  vmov -,1 IFNZ SUMS r5
-++  cmp r5,0
-++  beq filtering_done
-++  mov r11, 1 # Signal some filtering
-++  # And whether there is any strong filtering
-++  vmov -,1 IFN SUMS r5
-++  cmp r5,0
-++  beq normal_filtering
-++
-++  ##############################################################################
-++  # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!)
-++  vshl HX(tc2,0), HX(tc,0), 1  # Note that in normal filtering tx2 is tc/2, while here it is tc*2
-++
-++  # Take a copy of the original pixels for use in decision calculation
-++  vmov HX(P0,32),HX(P0,0)
-++  vmov HX(Q0,32),HX(Q0,0)
-++  vmov HX(P1,32),HX(P1,0)
-++  vmov HX(Q1,32),HX(Q1,0)
-++  vmov HX(P2,32),HX(P2,0)
-++  vmov HX(Q2,32),HX(Q2,0)
-++
-++  vadd -,HX(P2,32),4 CLRA SACC
-++  vshl -,HX(P1,32),1 SACC
-++  vshl -,HX(P0,32),1 SACC
-++  vshl -,HX(Q0,32),1 SACC
-++  vshl HX(delta,0),HX(Q1,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(P0,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
-++
-++  vadd -,HX(P2,32),2 CLRA SACC
-++  vadd -,HX(P1,32),HX(P0,32) SACC
-++  vshl HX(delta,0),HX(Q0,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 2
-++  vsub HX(delta,0),HX(delta,0),HX(P1,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
-++
-++  vadd -,HX(Q0,32),4 CLRA SACC
-++  vadd -,HX(P1,32),HX(P0,32) SACC
-++  vmul -,HX(P2,32),3 SACC
-++  vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(P2,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
-++  #vmov HX(P2,0),3 IFN
-++
-++  # Now reverse all P/Qs
-++
-++  vadd -,HX(Q2,32),4 CLRA SACC
-++  vshl -,HX(Q1,32),1 SACC
-++  vshl -,HX(Q0,32),1 SACC
-++  vshl -,HX(P0,32),1 SACC
-++  vshl HX(delta,0),HX(P1,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(Q0,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
-++
-++  vadd -,HX(Q2,32),2 CLRA SACC
-++  vadd -,HX(Q1,32),HX(Q0,32) SACC
-++  vshl HX(delta,0),HX(P0,32),0 SACC
-++  vasr HX(delta,0),HX(delta,0), 2
-++  vsub HX(delta,0),HX(delta,0),HX(Q1,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
-++
-++  vadd -,HX(P0,32),4 CLRA SACC
-++  vadd -,HX(Q1,32),HX(Q0,32) SACC
-++  vmul -,HX(Q2,32),3 SACC
-++  vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
-++  vasr HX(delta,0),HX(delta,0), 3
-++  vsub HX(delta,0),HX(delta,0),HX(Q2,32)
-++  vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
-++  vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
-++
-++  ##############################################################################
-++  # Normal filtering
-++normal_filtering:
-++  # Invert the decision flags
-++  # make instruction more complicated as assembler has error and loses SETF
-++  vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
-++  vmov  -, HX(tc10,0) SETF # IFN means normal filtering
-++
-++  vmov -,1 IFN SUMS r5
-++  cmp r5,0
-++  beq filtering_done
-++
-++  vasr HX(tc2,0), HX(tc,0), 1
-++  vmul HX(tc10,0), HX(tc,0), 10
-++
-++  vasr HX(thresh,0), HX(beta,0), 1
-++  vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
-++  vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
-++
-++  vadd HX(ptest,0),HX(dp,3),HX(dp,0)
-++  vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
-++  vadd HX(qtest,0),HX(dq,3),HX(dq,0)
-++  vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
-++  # Expand ptest and qtest together
-++  vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0)  # p.p.p.p.q.q.q.q
-++  vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
-++  valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
-++  valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
-++  valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
-++
-++  vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
-++  vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
-++  vmov -,8 CLRA SACC
-++  vmul -,HX(delta0,0), 9 SACC
-++  vmul HX(delta0,0),HX(delta1,0), r6 SACC
-++  vasr HX(delta0,0), HX(delta0,0), 4
-++  vdist HX(deltatest,0), HX(delta0,0), 0
-++  vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
-++  vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
-++
-++  vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
-++
-++  vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
-++  vadd HX(deltap1,0), HX(deltap1,0), 1
-++  vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
-++  vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
-++  vasr HX(deltap1,0), HX(deltap1,0), 1
-++  vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
-++
-++  vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
-++  vadd HX(deltaq1,0), HX(deltaq1,0), 1
-++  vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
-++  vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
-++  vrsub -, HX(delta0,0), 0 SACC
-++  vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
-++  vasr HX(deltaq1,0), HX(deltaq1,0), 1
-++  vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
-++
-++  vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
-++  vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
-++
-++  vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
-++  vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
-++
-++  vmov -,HX(deltatest,0) SETF
-++  vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
-++  vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
-++
-++  #vmov HX(P2,0),1 IFN
-++
-++filtering_done:
-++  b lr
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 0121fca..05b2169 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -147,7 +147,7 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   vcsm_init();
-+   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+-  memset(ptr, 0, sizeof *ptr);
-++  memset((void*)ptr, 0, sizeof *ptr);
-+   vc = gpu_mem_ptr.vc;
-+ 
-+   ptr->mb = mb;
-+@@ -254,7 +254,7 @@ void gpu_cache_flush(GPU_MEM_PTR_T *p)
-+     struct vcsm_user_clean_invalid_s iocache = {};
-+     iocache.s[0].handle = p->vcsm_handle;
-+     iocache.s[0].cmd = 3; // clean+invalidate
-+-    iocache.s[0].addr = p->arm;
-++    iocache.s[0].addr = (int) p->arm;
-+     iocache.s[0].size  = p->numbytes;
-+     vcsm_clean_invalid( &iocache );
-+ #else
-+@@ -390,6 +390,7 @@ static void *vpu_start(void *arg) {
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+   int last_time=0;
-+   long long on_time=0;
-++  long long on_time_deblock=0;
-+   long long off_time=0;
-+   int start_time;
-+   int end_time;
-+@@ -451,10 +452,13 @@ static void *vpu_start(void *arg) {
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+     end_time = Microseconds();
-+     last_time = end_time;
-+-    on_time += end_time - start_time;
-++    if (p[6]==2)
-++      on_time_deblock += end_time - start_time;
-++    else
-++      on_time += end_time - start_time;
-+     count++;
-+     if ((count&0x7f)==0)
-+-      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-+ #endif
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-+index e86eb30..c5d8b29 100644
-+--- a/libavcodec/rpi_shader.c
-++++ b/libavcodec/rpi_shader.c
-+@@ -61,7 +61,7 @@ unsigned int rpi_shader[] = {
-+ /* [0x00000120] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+ /* [0x00000128] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+ /* [0x00000130] */ 0x00000008, 0xe00208a7, // mov r2,8
-+-/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif, r2
-++/* [0x00000138] */ 0x11827c80, 0x10021327, // shl rb12,unif,r2
-+ /* [0x00000140] */ 0x0c827c80, 0x10021367, // add rb13,unif,r2
-+ /* [0x00000148] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+ /* [0x00000150] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+-- 
-+2.5.0
-+
-+
-+From 8864ce029b80325be328e0b2493f5ba18b10c906 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 1 Jul 2015 09:21:17 +0100
-+Subject: [PATCH 65/68] Added ability to combine jobs
-+
-+---
-+ libavcodec/rpi_qpu.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
-+ 1 file changed, 80 insertions(+), 1 deletion(-)
-+
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 05b2169..91777be 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -8,6 +8,8 @@
-+ #define RPI_TIME_TOTAL_POSTED
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-++// Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
-++#define RPI_COMBINE_JOBS
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -398,9 +400,15 @@ static void *vpu_start(void *arg) {
-+ #endif
-+   while(1) {
-+     int i;
-+-    int *p;
-++    int *p; // Pointer for a QPU/VPU job
-++#ifdef RPI_COMBINE_JOBS
-++    int *q = NULL; // Pointer for a VPU only job
-++    int have_qpu = 0;
-++    int have_vpu = 0;
-++#endif
-+     int qpu_code;
-+     int qpu_codeb;
-++    int num_jobs; // Number of jobs available
-+     pthread_mutex_lock(&post_mutex);
-+     while( vpu_async_tail - vpu_async_head <= 0)
-+     {
-+@@ -408,13 +416,38 @@ static void *vpu_start(void *arg) {
-+       pthread_cond_wait(&post_cond_tail, &post_mutex);
-+     }
-+     p = vpu_cmds[vpu_async_head%MAXCMDS];
-++    num_jobs = vpu_async_tail - vpu_async_head;
-+     pthread_mutex_unlock(&post_mutex);
-+ 
-+     if (p[6] == -1) {
-+       break; // Last job
-+     }
-++    if (p[7] == 0 && p[0] == 0 && p[16]==0)
-++      goto job_done_early;
-++
-++#ifdef RPI_COMBINE_JOBS
-++    // First scan for a qpu job
-++    for (int x=0;x<num_jobs;x++) {
-++      p = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-++      if (p[7]) {
-++        have_qpu = 1;
-++        break;
-++      }
-++    }
-++    // Now scan for a non-qpu job
-++    for (int x=0;x<num_jobs;x++) {
-++      q = vpu_cmds[(vpu_async_head+x)%MAXCMDS];
-++      if (!q[7]) {
-++        have_vpu = 1;
-++        break;
-++      }
-++    }
-++    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-++#endif
-+     qpu_code = p[7];
-+     qpu_codeb = p[16];
-++
-++
-+     //if (p[7]) {
-+         //GPU_MEM_PTR_T *buf = (GPU_MEM_PTR_T *)p[7];
-+         //gpu_cache_flush(buf);
-+@@ -427,6 +460,40 @@ static void *vpu_start(void *arg) {
-+     off_time += start_time-last_time;
-+ #endif
-+ 
-++#ifdef RPI_COMBINE_JOBS
-++    if (have_qpu) {
-++      for(i=0;i<8;i++) {
-++        gpu->mail[i*2] = p[8+i];
-++        gpu->mail[i*2 + 1] = qpu_code;
-++      }
-++      for(i=0;i<12;i++) {
-++        gpu->mail2[i*2] = p[17+i];
-++        gpu->mail2[i*2 + 1] = qpu_codeb;
-++      }
-++      if (have_vpu) {
-++        execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-++        q[0] = 0;
-++      } else {
-++        execute_multi(gpu->mb,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-++                              p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-++                              0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-++      }
-++      p[0] = 0;
-++      p[7] = 0;
-++      p[16] = 0;
-++    } else {
-++        av_assert0(have_vpu);
-++        vpu_execute_code(q[0], q[1], q[2], q[3], q[4], q[5], q[6]);
-++        q[0] = 0;
-++    }
-++#else
-++
-+     if (!qpu_code) {
-+       vpu_execute_code(p[0], p[1], p[2], p[3], p[4], p[5], p[6]);
-+     } else {
-+@@ -449,17 +516,29 @@ static void *vpu_start(void *arg) {
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+ #endif
-+     }
-++#endif
-++
-+ #ifdef RPI_TIME_TOTAL_POSTED
-+     end_time = Microseconds();
-+     last_time = end_time;
-++#ifdef RPI_COMBINE_JOBS
-++    // There are three cases we may wish to distinguish of VPU/QPU activity
-++    on_time += end_time - start_time;
-++#else
-+     if (p[6]==2)
-+       on_time_deblock += end_time - start_time;
-+     else
-+       on_time += end_time - start_time;
-++#endif
-+     count++;
-+     if ((count&0x7f)==0)
-++#ifdef RPI_COMBINE_JOBS
-+       printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-++#else
-++      printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#endif
-+ #endif
-++job_done_early:
-+     pthread_mutex_lock(&post_mutex);
-+     vpu_async_head++;
-+     pthread_cond_broadcast(&post_cond_head);
-+-- 
-+2.5.0
-+
-+
-+From 8289de8799cb666404d8d1a01c211a7be17bae61 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 1 Jul 2015 12:53:10 +0100
-+Subject: [PATCH 66/68] Added chroma deblocking
-+
-+---
-+ libavcodec/hevc.c               |  20 ++
-+ libavcodec/hevc.h               |  12 +-
-+ libavcodec/hevc_filter.c        |  92 +++++-
-+ libavcodec/rpi_hevc_transform.h | 644 +++++++++++++++++++++++++++++++++++++++-
-+ libavcodec/rpi_hevc_transform.s | 207 +++++++++++++
-+ libavcodec/rpi_qpu.c            |  27 +-
-+ libavcodec/rpi_shader.qasm      |  11 +
-+ 7 files changed, 988 insertions(+), 25 deletions(-)
-+
-+diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
-+index da4bebb..d56f777 100644
-+--- a/libavcodec/hevc.c
-++++ b/libavcodec/hevc.c
-+@@ -249,6 +249,14 @@ static void pic_arrays_free(HEVCContext *s)
-+       gpu_free(&s->y_setup_ptr);
-+       s->y_setup_arm = 0;
-+     }
-++    if (s->uv_setup_arm) {
-++      gpu_free(&s->uv_setup_ptr);
-++      s->uv_setup_arm = 0;
-++    }
-++    if (s->vpu_cmds_arm) {
-++      gpu_free(&s->vpu_cmds_ptr);
-++      s->vpu_cmds_arm = 0;
-++    }
-+ #endif
-+     av_freep(&s->sao);
-+     av_freep(&s->deblock);
-+@@ -322,6 +330,18 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
-+     s->y_setup_vc = (void*)s->y_setup_ptr.vc;
-+     memset(s->y_setup_arm, 0, s->y_setup_ptr.numbytes);
-+     printf("Setup %d by %d by %d\n",s->setup_width,s->setup_height,sizeof(*s->y_setup_arm));
-++
-++    s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
-++    s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
-++    gpu_malloc_uncached(sizeof(*s->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height, &s->uv_setup_ptr); // TODO make this cached
-++    s->uv_setup_arm = (void*)s->uv_setup_ptr.arm;
-++    s->uv_setup_vc = (void*)s->uv_setup_ptr.vc;
-++    memset(s->uv_setup_arm, 0, s->uv_setup_ptr.numbytes);
-++    printf("Setup uv %d by %d by %d\n",s->uv_setup_width,s->uv_setup_height,sizeof(*s->uv_setup_arm));
-++
-++    gpu_malloc_uncached(sizeof(*s->vpu_cmds_arm) * 3,&s->vpu_cmds_ptr);
-++    s->vpu_cmds_arm = (void*) s->vpu_cmds_ptr.arm;
-++    s->vpu_cmds_vc = s->vpu_cmds_ptr.vc;
-+ #endif
-+ 
-+     s->bs_width  = (width  >> 2) + 1;
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index c933757..6675a4f 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -55,7 +55,7 @@
-+   #define RPI_MAX_JOBS 2
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+-
-++  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
-+   #define RPI_DEBLOCK_VPU
-+ 
-+ #endif
-+@@ -1006,6 +1006,16 @@ typedef struct HEVCContext {
-+     uint8_t (*y_setup_vc)[2][2][2][4];
-+     int setup_width; // Number of 16x16 blocks across the image
-+     int setup_height; // Number of 16x16 blocks down the image
-++
-++    GPU_MEM_PTR_T uv_setup_ptr;
-++    uint8_t (*uv_setup_arm)[2][2][2][4];  // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned
-++    uint8_t (*uv_setup_vc)[2][2][2][4];
-++    int uv_setup_width;
-++    int uv_setup_height;
-++
-++    GPU_MEM_PTR_T vpu_cmds_ptr;
-++    int (*vpu_cmds_arm)[6]; // r0-r5 for each command
-++    int vpu_cmds_vc;
-+ #endif
-+ 
-+ #endif
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 06371da..6367068 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -656,9 +656,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                                    s->frame->linesize[chroma],
-+                                                                    c_tc, no_p, no_q);
-+                         } else
-++#ifdef RPI_DEBLOCK_VPU
-++                        if (s->enable_rpi_deblock) {
-++                            uint8_t (*setup)[2][2][4];
-++                            int xc = x>>s->ps.sps->hshift[chroma];
-++                            int yc = y>>s->ps.sps->vshift[chroma];
-++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-++                            int a = ((yc>>3) & 1) << 1;
-++                            int b = (xc>>3) & 1;
-++                            setup = s->uv_setup_arm[num16];
-++                            setup[0][b][0][a] = c_tc[0];
-++                            setup[0][b][0][a + 1] = c_tc[1];
-++                        } else
-++#endif
-+                             s->hevcdsp.hevc_v_loop_filter_chroma(src,
-+                                                                  s->frame->linesize[chroma],
-+                                                                  c_tc, no_p, no_q);
-++
-+                     }
-+                 }
-+ 
-+@@ -689,6 +703,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+                                                                    s->frame->linesize[chroma],
-+                                                                    c_tc, no_p, no_q);
-+                         } else
-++#ifdef RPI_DEBLOCK_VPU
-++                        if (s->enable_rpi_deblock) {
-++                            uint8_t (*setup)[2][2][4];
-++                            int xc = x>>s->ps.sps->hshift[chroma];
-++                            int yc = y>>s->ps.sps->vshift[chroma];
-++                            int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
-++                            int a = ((xc>>3) & 1) << 1;
-++                            int b = (yc>>3) & 1;
-++                            setup = s->uv_setup_arm[num16];
-++                            setup[1][b][0][a] = c_tc[0];
-++                            setup[1][b][0][a + 1] = c_tc[1];
-++                        } else
-++#endif
-+                             s->hevcdsp.hevc_h_loop_filter_chroma(src,
-+                                                                  s->frame->linesize[chroma],
-+                                                                  c_tc, no_p, no_q);
-+@@ -1013,33 +1040,56 @@ void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
-+ static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
-+ {
-+   // Flush image, 4 lines above to bottom of ctb stripe
-+-  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 0);
-++  ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
-+   // TODO flush buffer of beta/tc setup when it becomes cached
-++
-++  // Prepare three commands at once to avoid calling overhead
-++  s->vpu_cmds_arm[0][0] = get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y;
-++  s->vpu_cmds_arm[0][1] = s->frame->linesize[0];
-++  s->vpu_cmds_arm[0][2] = s->setup_width;
-++  s->vpu_cmds_arm[0][3] = (int) ( s->y_setup_vc + s->setup_width * (y>>4) );
-++  s->vpu_cmds_arm[0][4] = ctb_size>>4;
-++  s->vpu_cmds_arm[0][5] = 2;
-++
-++  s->vpu_cmds_arm[1][0] = get_vc_address(s->frame->buf[1]) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
-++  s->vpu_cmds_arm[1][1] = s->frame->linesize[1];
-++  s->vpu_cmds_arm[1][2] = s->uv_setup_width;
-++  s->vpu_cmds_arm[1][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-++  s->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-++  s->vpu_cmds_arm[1][5] = 3;
-++
-++  s->vpu_cmds_arm[2][0] = get_vc_address(s->frame->buf[2]) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
-++  s->vpu_cmds_arm[2][1] = s->frame->linesize[2];
-++  s->vpu_cmds_arm[2][2] = s->uv_setup_width;
-++  s->vpu_cmds_arm[2][3] = (int) ( s->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
-++  s->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
-++  s->vpu_cmds_arm[2][5] = 4;
-++
-+   // Call VPU
-+-  // TODO add this to a separate pipeline of VPU jobs that can be run in parallel and wait for completion
-+-  vpu_wait(vpu_post_code( vpu_get_fn(), get_vc_address(s->frame->buf[0]) + s->frame->linesize[0] * y, s->frame->linesize[0],
-+-                               s->setup_width, (int) ( s->y_setup_vc + s->setup_width * (y>>4) ),
-+-                               ctb_size>>4, 2, 0)); // 2 means to do the deblocking code
-++  vpu_wait(vpu_post_code( vpu_get_fn(), s->vpu_cmds_vc, 3, 0, 0, 0, 5, 0)); // 5 means to do all the commands
-+ }
-+ 
-+-static void rpi_deblock2(HEVCContext *s, int y, int ctb_size)
-+-{
-+-   int y2;
-+-   for(y2=y;y2<y+ctb_size;y2+=16) {
-+-      rpi_deblock(s,y2,16);
-+-   }
-+-}
-+ #endif
-+ 
-+ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+ {
-+     int x_end = x >= s->ps.sps->width  - ctb_size;
-++#ifdef RPI_DEBLOCK_VPU
-++    int done_deblock = 0;
-++#endif
-+     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
-+         deblocking_filter_CTB(s, x, y);
-+ #ifdef RPI_DEBLOCK_VPU
-+     if (s->enable_rpi_deblock && x_end)
-+     {
-+-      rpi_deblock(s, y, ctb_size);
-++      int y_at_end = y >= s->ps.sps->height - ctb_size;
-++      int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
-++      int y_start = y&~63;
-++      if (y_at_end) height = s->ps.sps->height - y_start;
-++      if ((((y+ctb_size)&63)==0) || y_at_end) {
-++        done_deblock = 1;
-++        rpi_deblock(s, y_start, height);
-++      }
-+     }
-+ #endif
-+     if (s->ps.sps->sao_enabled) {
-+@@ -1070,11 +1120,25 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
-+         //int newh = y + ctb_size - 4;
-+         //int currh = s->ref->tf.progress->data[0];
-+         //if (((y + ctb_size)&63)==0)
-++#ifdef RPI_DEBLOCK_VPU
-++        if (s->enable_rpi_deblock) {
-++          // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-++          if (done_deblock) {
-++            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++          }
-++        } else {
-++#ifdef RPI_INTER_QPU
-++          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-++#endif
-++          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++        }
-++#else
-+ #ifdef RPI_INTER_QPU
-+         ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
-+-        // TODO we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-++        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
-+ #endif
-+         ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
-++#endif
-+     }
-+ }
-+ 
-+diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
-+index b3f155f..4309f1c 100644
-+--- a/libavcodec/rpi_hevc_transform.h
-++++ b/libavcodec/rpi_hevc_transform.h
-+@@ -3,14 +3,32 @@ unsigned char rpi_hevc_transform [] = {
-+ 106,
-+ 0,
-+ 144,
-+-38,
-++47,
-+ 1,
-+ 37,
-+ 106,
-+ 0,
-+ 144,
-+-57,
-++66,
-+ 1,
-++53,
-++106,
-++0,
-++144,
-++192,
-++4,
-++69,
-++106,
-++0,
-++144,
-++192,
-++4,
-++85,
-++106,
-++0,
-++144,
-++220,
-++5,
-+ 169,
-+ 3,
-+ 62,
-+@@ -2427,4 +2445,626 @@ unsigned char rpi_hevc_transform [] = {
-+ 128,
-+ 90,
-+ 0,
-++169,
-++3,
-++14,
-++96,
-++4,
-++31,
-++169,
-++3,
-++30,
-++96,
-++1,
-++31,
-++73,
-++64,
-++52,
-++64,
-++45,
-++64,
-++2,
-++64,
-++10,
-++64,
-++64,
-++198,
-++1,
-++7,
-++8,
-++232,
-++63,
-++0,
-++0,
-++0,
-++6,
-++232,
-++253,
-++255,
-++255,
-++255,
-++0,
-++246,
-++0,
-++0,
-++0,
-++4,
-++215,
-++64,
-++3,
-++96,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++30,
-++106,
-++132,
-++24,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++143,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++135,
-++0,
-++131,
-++102,
-++0,
-++158,
-++71,
-++0,
-++2,
-++248,
-++0,
-++35,
-++0,
-++0,
-++64,
-++56,
-++0,
-++0,
-++4,
-++248,
-++0,
-++36,
-++0,
-++0,
-++64,
-++56,
-++8,
-++0,
-++0,
-++240,
-++64,
-++0,
-++132,
-++3,
-++30,
-++106,
-++132,
-++24,
-++128,
-++240,
-++0,
-++0,
-++132,
-++3,
-++128,
-++144,
-++112,
-++0,
-++131,
-++98,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++104,
-++0,
-++131,
-++102,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++30,
-++106,
-++134,
-++24,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++123,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++112,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++3,
-++99,
-++131,
-++71,
-++68,
-++232,
-++32,
-++0,
-++0,
-++0,
-++0,
-++99,
-++2,
-++99,
-++23,
-++102,
-++7,
-++106,
-++127,
-++156,
-++178,
-++255,
-++0,
-++248,
-++64,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++30,
-++106,
-++134,
-++24,
-++128,
-++248,
-++0,
-++0,
-++112,
-++0,
-++192,
-++243,
-++211,
-++31,
-++128,
-++144,
-++72,
-++0,
-++188,
-++64,
-++67,
-++232,
-++0,
-++2,
-++0,
-++0,
-++0,
-++255,
-++64,
-++0,
-++0,
-++20,
-++200,
-++243,
-++0,
-++0,
-++128,
-++144,
-++61,
-++0,
-++195,
-++232,
-++0,
-++2,
-++0,
-++0,
-++12,
-++128,
-++7,
-++192,
-++130,
-++248,
-++0,
-++0,
-++112,
-++192,
-++224,
-++16,
-++195,
-++31,
-++132,
-++248,
-++1,
-++0,
-++112,
-++0,
-++224,
-++16,
-++203,
-++31,
-++25,
-++102,
-++9,
-++106,
-++2,
-++30,
-++41,
-++3,
-++26,
-++87,
-++162,
-++64,
-++64,
-++198,
-++1,
-++23,
-++127,
-++158,
-++95,
-++255,
-++239,
-++3,
-++0,
-++254,
-++128,
-++143,
-++94,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++95,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++208,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++209,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++47,
-++0,
-++8,
-++255,
-++227,
-++23,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++52,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++239,
-++3,
-++0,
-++254,
-++128,
-++143,
-++14,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++143,
-++15,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++192,
-++142,
-++16,
-++0,
-++0,
-++240,
-++12,
-++0,
-++0,
-++254,
-++128,
-++142,
-++17,
-++0,
-++0,
-++240,
-++12,
-++0,
-++128,
-++144,
-++13,
-++0,
-++8,
-++255,
-++227,
-++3,
-++0,
-++244,
-++192,
-++51,
-++0,
-++0,
-++8,
-++255,
-++35,
-++4,
-++0,
-++180,
-++192,
-++51,
-++0,
-++0,
-++111,
-++3,
-++32,
-++246,
-++192,
-++11,
-++1,
-++16,
-++32,
-++246,
-++2,
-++140,
-++47,
-++240,
-++32,
-++247,
-++35,
-++141,
-++63,
-++178,
-++64,
-++254,
-++35,
-++141,
-++2,
-++68,
-++192,
-++243,
-++128,
-++11,
-++32,
-++255,
-++35,
-++240,
-++58,
-++226,
-++192,
-++243,
-++188,
-++10,
-++0,
-++254,
-++0,
-++141,
-++4,
-++4,
-++0,
-++240,
-++128,
-++10,
-++88,
-++246,
-++35,
-++141,
-++3,
-++68,
-++240,
-++246,
-++35,
-++141,
-++48,
-++66,
-++0,
-++247,
-++227,
-++143,
-++52,
-++242,
-++32,
-++247,
-++227,
-++142,
-++52,
-++178,
-++90,
-++0,
-++161,
-++3,
-++6,
-++64,
-++23,
-++64,
-++96,
-++8,
-++70,
-++98,
-++97,
-++8,
-++70,
-++98,
-++98,
-++8,
-++70,
-++98,
-++99,
-++8,
-++70,
-++98,
-++100,
-++8,
-++70,
-++98,
-++101,
-++8,
-++70,
-++98,
-++255,
-++159,
-++8,
-++250,
-++23,
-++102,
-++7,
-++106,
-++112,
-++30,
-++33,
-++3,
-+ };
-+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-+index b055208..5543093 100644
-+--- a/libavcodec/rpi_hevc_transform.s
-++++ b/libavcodec/rpi_hevc_transform.s
-+@@ -85,6 +85,13 @@ hevc_trans_16x16:
-+   beq memclear16
-+   cmp r5,2
-+   beq hevc_deblock_16x16
-++  cmp r5,3
-++  beq hevc_uv_deblock_16x16
-++  cmp r5,4
-++  beq hevc_uv_deblock_16x16_with_clear
-++  cmp r5,5
-++  beq hevc_run_command_list
-++
-+   push r6-r15, lr # TODO cut down number of used registers
-+   mov r14,r3 # coeffs32
-+   mov r15,r4 # num32
-+@@ -708,3 +715,203 @@ normal_filtering:
-+ 
-+ filtering_done:
-+   b lr
-++
-++
-++hevc_uv_deblock_16x16:
-++  push r6-r15, lr
-++  mov r14,0
-++  b hevc_uv_start
-++hevc_uv_deblock_16x16_with_clear:
-++  push r6-r15, lr
-++  mov r14,1
-++  b hevc_uv_start
-++
-++hevc_uv_start:
-++  mov r9,r4
-++  mov r4,r3
-++  mov r13,r2
-++  mov r2,r0
-++  mov r10,r0
-++  subscale4 r0,r1
-++  mov r8,63
-++  mov r6,-3
-++  vmov H(zeros,0),0
-++# r7 is number of blocks still to load
-++# r0 is location of current block - 4 * stride
-++# r1 is stride
-++# r2 is location of current block
-++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical
-++# r4 is setup
-++# r5 is for temporary calculations
-++# r8 holds 63
-++# r6 holds -3
-++# r9 holds the number of 16 high rows to process
-++# r10 holds the original img base
-++# r11 returns 0 if no filtering was done on the edge
-++# r12 saves a copy of this
-++# r13 is copy of width
-++# r14 is 1 if we should clear the old contents, or 0 if not
-++
-++uv_process_row:
-++  # First iteration does not do horizontal filtering on previous
-++  mov r7, r13
-++  mov r3,0
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4    # Load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)  # We may wish to prefetch these
-++  cmp r14,1
-++  bne uv_skip0
-++  vstb H(zeros,0),(r4)
-++uv_skip0:
-++  bl uv_vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
-++  bl uv_vert_filter
-++  sub r3,8
-++  b uv_start_deblock_loop
-++uv_deblock_loop:
-++  # Middle iterations do vertical on current block and horizontal on preceding
-++  vldb H(12++,16)+r3,(r0 += r1) REP 4  # load the current block
-++  vldb H(16++,16)+r3,(r2 += r1) REP 16
-++  vldb H(setup_input,0), (r4)
-++  cmp r14,1
-++  bne uv_skip1
-++  vstb H(zeros,0),(r4)
-++uv_skip1:
-++  bl uv_vert_filter
-++  add r3,8
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_vert_filter
-++  sub r3,8
-++  vldb H(setup_input,0), -16(r4)
-++  cmp r14,1
-++  bne uv_skip3
-++  vstb H(zeros,0),-16(r4)
-++uv_skip3:
-++  bl uv_horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_horz_filter
-++  sub r3,8*64
-++  addcmpbeq r12,0,0,uv_skip_save_top
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++uv_skip_save_top:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++uv_start_deblock_loop:
-++  # move onto next 16x16 (could do this with circular buffer support instead)
-++  add r3,16
-++  and r3,r8
-++  add r4,32
-++  # Perform loop counter operations (may work with an addcmpbgt as well?)
-++  add r0,16
-++  add r2,16
-++  sub r7,1
-++  cmp r7,0 # Are there still more blocks to load
-++  bgt uv_deblock_loop
-++
-++  # Final iteration needs to just do horizontal filtering
-++  vldb H(setup_input,0), -16(r4)
-++  cmp r14,1
-++  bne uv_skip2
-++  vstb H(zeros,0),-16(r4)
-++uv_skip2:
-++  bl uv_horz_filter
-++  mov r12,r11
-++  add r3,8*64
-++  vadd H(setup_input,0),H(setup_input,8),0
-++  bl uv_horz_filter
-++  sub r3,64*8
-++  addcmpbeq r12,0,0,uv_skip_save_top2
-++  vstb H(12++,0)+r3,-16(r0 += r1) REP 4  # Save the deblocked pixels for the previous block
-++uv_skip_save_top2:
-++  vstb H(16++,0)+r3,-16(r2 += r1) REP 16
-++
-++# Now look to see if we should do another row
-++  sub r9,1
-++  cmp r9,0
-++  bgt uv_start_again
-++  pop r6-r15, pc
-++uv_start_again:
-++  # Need to sort out r0,r2 to point to next row down
-++  addscale16 r10,r1
-++  mov r2,r10
-++  subscale4 r0,r2,r1
-++  b uv_process_row
-++
-++
-++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
-++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations
-++
-++uv_vert_filter:
-++  push lr
-++
-++  vmov HX(P1,0), V(16,14)+r3
-++  vmov HX(P0,0), V(16,15)+r3
-++  vmov HX(Q0,0), V(16,16)+r3
-++  vmov HX(Q1,0), V(16,17)+r3
-++
-++  bl do_chroma_filter
-++
-++  vadds V(16,15)+r3, HX(P0,0), 0
-++  vadds V(16,16)+r3, HX(Q0,0), 0
-++
-++  pop pc
-++
-++# Filter edge at H(16,0)+r3
-++uv_horz_filter:
-++  push lr
-++
-++  vmov HX(P1,0), H(14,0)+r3
-++  vmov HX(P0,0), H(15,0)+r3
-++  vmov HX(Q0,0), H(16,0)+r3
-++  vmov HX(Q1,0), H(17,0)+r3
-++
-++  bl do_chroma_filter
-++
-++  vadds H(15,0)+r3, HX(P0,0), 0
-++  # P3 and Q3 never change so don't bother saving back
-++  vadds H(16,0)+r3, HX(Q0,0), 0
-++
-++  pop pc
-++
-++# r4 points to array of beta/tc for each 4 length edge
-++do_chroma_filter:
-++  valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
-++  valtl HX(tc,0),H(setup,0),H(setup,0)
-++
-++  vsub HX(delta,0),HX(Q0,0),HX(P0,0)
-++  vshl HX(delta,0),HX(delta,0),2 CLRA SACC
-++  vsub -,HX(P1,0),HX(Q1,0) SACC
-++  vmov HX(delta,0),4 SACC
-++  vasr HX(delta,0),HX(delta,0),3
-++  vclamps HX(delta,0), HX(delta,0), HX(tc,0)
-++  vadd HX(P0,0),HX(P0,0),HX(delta,0)
-++  vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
-++  b lr
-++
-++# r0 = list
-++# r1 = number
-++hevc_run_command_list:
-++  push r6-r7, lr
-++  mov r6, r0
-++  mov r7, r1
-++loop_cmds:
-++  ld r0,(r6) # How to encode r6++?
-++  add r6,4
-++  ld r1,(r6)
-++  add r6,4
-++  ld r2,(r6)
-++  add r6,4
-++  ld r3,(r6)
-++  add r6,4
-++  ld r4,(r6)
-++  add r6,4
-++  ld r5,(r6)
-++  add r6,4
-++  bl hevc_trans_16x16
-++  sub r7,1
-++  cmp r7,0
-++  bgt loop_cmds
-++
-++  pop r6-r7, pc
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 91777be..5aa0432 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -397,6 +397,8 @@ static void *vpu_start(void *arg) {
-+   int start_time;
-+   int end_time;
-+   int count=0;
-++  int count_deblock=0;
-++  int count_qpu=0;
-+ #endif
-+   while(1) {
-+     int i;
-+@@ -442,7 +444,7 @@ static void *vpu_start(void *arg) {
-+         break;
-+       }
-+     }
-+-    printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-++    //printf("Have_qpu = %d, have_vpu=%d\n",have_qpu,have_vpu);
-+ #endif
-+     qpu_code = p[7];
-+     qpu_codeb = p[16];
-+@@ -460,6 +462,12 @@ static void *vpu_start(void *arg) {
-+     off_time += start_time-last_time;
-+ #endif
-+ 
-++#define NO_FLUSH 1
-++#define CLEAR_PROFILE 2
-++#define OUTPUT_COUNTS 4
-++
-++#define FLAGS_FOR_PROFILING (NO_FLUSH)
-++
-+ #ifdef RPI_COMBINE_JOBS
-+     if (have_qpu) {
-+       for(i=0;i<8;i++) {
-+@@ -472,14 +480,14 @@ static void *vpu_start(void *arg) {
-+       }
-+       if (have_vpu) {
-+         execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               q[0], q[1], q[2], q[3], q[4], q[5], q[6]); // VPU1
-+         q[0] = 0;
-+       } else {
-+         execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING, 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+@@ -510,7 +518,7 @@ static void *vpu_start(void *arg) {
-+       execute_qpu(gpu->mb,8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */);
-+ #else
-+       execute_multi(gpu->mb,
-+-                              12,gpu->vc + offsetof(struct GPU, mail2), 1, 5000,
-++                              12,gpu->vc + offsetof(struct GPU, mail2), FLAGS_FOR_PROFILING , 5000,
-+                               8,gpu->vc + offsetof(struct GPU, mail), 1 /* no flush */, 5000 /* timeout ms */,
-+                               p[0], p[1], p[2], p[3], p[4], p[5], p[6], // VPU0
-+                               0,    0   , 0   , 0   , 0   , 0   , 0); // VPU1
-+@@ -525,17 +533,20 @@ static void *vpu_start(void *arg) {
-+     // There are three cases we may wish to distinguish of VPU/QPU activity
-+     on_time += end_time - start_time;
-+ #else
-+-    if (p[6]==2)
-++    if (p[6]>1) {
-++      count_deblock++;
-+       on_time_deblock += end_time - start_time;
-+-    else
-++    } else {
-+       on_time += end_time - start_time;
-++      count_qpu++;
-++    }
-+ #endif
-+     count++;
-+     if ((count&0x7f)==0)
-+ #ifdef RPI_COMBINE_JOBS
-+-      printf("Posted %d On=%dms, On_deblock=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(on_time_deblock/1000),(int)(off_time/1000));
-+-#else
-+       printf("Posted %d On=%dms, Off=%dms\n",count,(int)(on_time/1000),(int)(off_time/1000));
-++#else
-++      printf("Posted %d On=%dms (%d calls), On_deblock=%dms (%d calls), Off=%dms\n",count,(int)(on_time/1000),count_qpu,(int)(on_time_deblock/1000),count_deblock,(int)(off_time/1000));
-+ #endif
-+ #endif
-+ job_done_early:
-+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-+index 0686249..64bf5b0 100644
-+--- a/libavcodec/rpi_shader.qasm
-++++ b/libavcodec/rpi_shader.qasm
-+@@ -1077,6 +1077,17 @@ nop        ; nop # delay slot 2
-+ ::mc_interrupt_exit12
-+   mov  -, vw_wait # wait on the VDW
-+ 
-++  # Dummy wait to test instructions
-++#  mov r3,1000000
-++#:dummy_loop
-++#  sub.setf r3, r3, 1
-++#  nop
-++#  nop
-++#  brr.anynn -, r:dummy_loop
-++#  nop
-++#  nop
-++#  nop
-++
-+   ldtmu0
-+   ldtmu0
-+   ldtmu1
-+-- 
-+2.5.0
-+
-+
-+From f7a8b294a317dc015d19a342aa20a994a299676d Mon Sep 17 00:00:00 2001
-+From: Ben Avison <bavison@riscosopen.org>
-+Date: Tue, 23 Jun 2015 23:42:03 +0100
-+Subject: [PATCH 67/68] armv7/hevc: Optimise deblocking boundary strength
-+ calculation
-+
-+---
-+ libavcodec/arm/hevcdsp_deblock_neon.S | 115 +++++++++++++++++
-+ libavcodec/arm/hevcdsp_init_neon.c    |   9 ++
-+ libavcodec/hevc.h                     |  11 --
-+ libavcodec/hevc_filter.c              | 224 ++++++++++++++--------------------
-+ libavcodec/hevcdsp.c                  | 116 ++++++++++++++++++
-+ libavcodec/hevcdsp.h                  |  14 +++
-+ 6 files changed, 344 insertions(+), 145 deletions(-)
-+
-+diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-+index 166bddb..bad4589 100644
-+--- a/libavcodec/arm/hevcdsp_deblock_neon.S
-++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-+@@ -383,3 +383,118 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
-+         vst1.8   {d4}, [r0]
-+         bx       lr
-+ endfunc
-++
-++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
-++ */
-++function ff_hevc_deblocking_boundary_strengths_neon, export=1
-++        add         ip, sp, #4*4
-++        push        {a2-a4,v1-v8,lr}
-++        ldmia       ip, {v5-v7}
-++1:      ldmdb       ip, {v1-v4}
-++        ldrsb       a3, [v5, #8]    @ curr->ref_idx
-++        ldrsb       v8, [v5, #9]
-++        ldrsb       ip, [v6, #8]    @ neigh->ref_idx
-++        ldrsb       lr, [v6, #9]
-++        ldr         v1, [v1, a3, lsl #2]
-++        ldrb        a3, [v5, #10]   @ curr->pred_flag
-++        ldr         v2, [v2, v8, lsl #2]
-++        ldrb        v8, [v6, #10]   @ neigh->pred_flag
-++        ldr         v3, [v3, ip, lsl #2]
-++        ldr         v4, [v4, lr, lsl #2]
-++        teq         a3, #3
-++        beq         20f
-++        teq         v8, #3
-++        beq         90f
-++
-++        tst         a3, #1
-++        ldrne       a3, [v5, #0]    @ curr->mv[0]
-++        ldreq       a3, [v5, #4]    @ curr->mv[1]
-++        moveq       v1, v2
-++        tst         v8, #1
-++        ldrne       v8, [v6, #0]    @ neigh->mv[0]
-++        ldreq       v8, [v6, #4]    @ neigh->mv[1]
-++        moveq       v3, v4
-++        teq         v1, v3
-++        bne         10f
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v8, a3
-++        ssub16      a3, a3, v8
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        @ drop through
-++10:     movne       a3, #1
-++11:     subs        a2, a2, #1
-++12:     strbhs      a3, [v7], a4
-++        subs        a2, a2, #1
-++        bhs         12b
-++
-++        ldm         sp, {a2, a3}
-++        add         ip, sp, #16*4
-++        subs        a1, a1, #1
-++        add         v5, v5, a3
-++        add         v6, v6, a3
-++        bhi         1b
-++        pop         {a2-a4,v1-v8,pc}
-++
-++20:     teq         v8, #3
-++        bne         10b
-++
-++        teq         v1, v3
-++        teqeq       v2, v4
-++        bne         40f
-++        teq         v1, v2
-++        bne         30f
-++
-++        ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v3, v1
-++        ssub16      a3, v1, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         25f
-++        ssub16      ip, v4, v2
-++        ssub16      a3, v2, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        beq         11b
-++        @ drop through
-++25:     ssub16      ip, v4, v1
-++        ssub16      a3, v1, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         10b
-++        ssub16      ip, v3, v2
-++        ssub16      a3, v2, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        b           10b
-++
-++30:     ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        ssub16      ip, v3, v1
-++        ssub16      a3, v1, v3
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        bne         10b
-++        ssub16      ip, v4, v2
-++        ssub16      a3, v2, v4
-++        sel         a3, a3, ip
-++        ands        a3, a3, lr
-++        b           10b
-++
-++40:     teq         v1, v4
-++        teqeq       v2, v3
-++        bne         10b
-++
-++        ldrd        v1, v2, [v5]    @ curr->mv
-++        ldrd        v3, v4, [v6]    @ neigh->mv
-++        ldr         lr, =0xFFFCFFFC
-++        b           25b
-++
-++90:     mov         a3, #1
-++        b           11b
-++endfunc
-+diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
-+index e5da7e9..49c70dd 100644
-+--- a/libavcodec/arm/hevcdsp_init_neon.c
-++++ b/libavcodec/arm/hevcdsp_init_neon.c
-+@@ -290,6 +290,10 @@ static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t
-+ }
-+ #undef CMP
-+ 
-++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
-++                                                int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                                MvField *curr, MvField *neigh, uint8_t *bs);
-++
-+ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+ {
-+     if (bit_depth == 8) {
-+@@ -387,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
-+         c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
-+         c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
-+     }
-++
-++    assert(offsetof(MvField, mv) == 0);
-++    assert(offsetof(MvField, ref_idx) == 8);
-++    assert(offsetof(MvField, pred_flag) == 10);
-++    c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
-+ }
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 6675a4f..10fbccc 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -683,17 +683,6 @@ typedef struct CodingUnit {
-+     uint8_t cu_transquant_bypass_flag;
-+ } CodingUnit;
-+ 
-+-typedef struct Mv {
-+-    int16_t x;  ///< horizontal component of motion vector
-+-    int16_t y;  ///< vertical component of motion vector
-+-} Mv;
-+-
-+-typedef struct MvField {
-+-    DECLARE_ALIGNED(4, Mv, mv)[2];
-+-    int8_t ref_idx[2];
-+-    int8_t pred_flag;
-+-} MvField;
-+-
-+ typedef struct NeighbourAvailable {
-+     int cand_bottom_left;
-+     int cand_left;
-+diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
-+index 6367068..826a82f 100644
-+--- a/libavcodec/hevc_filter.c
-++++ b/libavcodec/hevc_filter.c
-+@@ -726,69 +726,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
-+     }
-+ }
-+ 
-+-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
-+-                             RefPicList *neigh_refPicList)
-+-{
-+-    if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-+-        // same L0 and L1
-+-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]]  &&
-+-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
-+-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
-+-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-+-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-+-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-+-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
-+-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
-+-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else {
-+-            return 1;
-+-        }
-+-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+-        Mv A, B;
-+-        int ref_A, ref_B;
-+-
-+-        if (curr->pred_flag & 1) {
-+-            A     = curr->mv[0];
-+-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
-+-        } else {
-+-            A     = curr->mv[1];
-+-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
-+-        }
-+-
-+-        if (neigh->pred_flag & 1) {
-+-            B     = neigh->mv[0];
-+-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
-+-        } else {
-+-            B     = neigh->mv[1];
-+-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
-+-        }
-+-
-+-        if (ref_A == ref_B) {
-+-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
-+-                return 1;
-+-            else
-+-                return 0;
-+-        } else
-+-            return 1;
-+-    }
-+-
-+-    return 1;
-+-}
-+ 
-+ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+                                            int log2_trafo_size)
-+@@ -799,10 +736,17 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
-+     int min_pu_width     = s->ps.sps->min_pu_width;
-+     int min_tu_width     = s->ps.sps->min_tb_width;
-+-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
-+-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
-+     int boundary_upper, boundary_left;
-+-    int i, j, bs;
-++    int i, j;
-++    RefPicList *rpl      = s->ref->refPicList;
-++    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
-++    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
-++    int y_pu             = y0 >> log2_min_pu_size;
-++    int x_pu             = x0 >> log2_min_pu_size;
-++    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
-++    int is_intra         = curr->pred_flag == PF_INTRA;
-++    int inc              = log2_min_pu_size == 2 ? 2 : 1;
-++    uint8_t *bs;
-+ 
-+ #ifdef DISABLE_STRENGTHS
-+     return;
-+@@ -818,34 +762,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+           (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-+         boundary_upper = 0;
-+ 
-++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
-++
-+     if (boundary_upper) {
-+         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
-+                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
-+-                              s->ref->refPicList;
-+-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
-+-        int yq_pu =  y0      >> log2_min_pu_size;
-+-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
-+-        int yq_tu =  y0      >> log2_min_tu_size;
-++                              rpl;
-++        MvField *top = curr - min_pu_width;
-++
-++        if (is_intra) {
-++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
-++                bs[i >> 2] = 2;
-++
-++        } else {
-++            int y_tu = y0 >> log2_min_tu_size;
-++            int x_tu = x0 >> log2_min_tu_size;
-++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-++                    rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
-++                    curr, top, bs);
-+ 
-+             for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int x_pu = (x0 + i) >> log2_min_pu_size;
-+-                int x_tu = (x0 + i) >> log2_min_tu_size;
-+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-+-                uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
-+-                uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
-+-
-+-                if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
-+-                    bs = 2;
-+-                else if (curr_cbf_luma || top_cbf_luma)
-+-                    bs = 1;
-+-                else
-+-                    bs = boundary_strength(s, curr, top, rpl_top);
-+-                s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
-++                int i_pu = i >> log2_min_pu_size;
-++                int i_tu = i >> log2_min_tu_size;
-++
-++                if (top[i_pu].pred_flag == PF_INTRA)
-++                    bs[i >> 2] = 2;
-++                else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
-++                    bs[i >> 2] = 1;
-+             }
-++        }
-++    }
-++
-++    if (!is_intra) {
-++        for (j = inc; j < trafo_in_min_pus; j += inc) {
-++            MvField *top;
-++
-++            curr += min_pu_width * inc;
-++            top = curr - min_pu_width;
-++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, sizeof (MvField), 4 >> 2,
-++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-++                    curr, top, bs);
-++        }
-+     }
-+ 
-+-    // bs for vertical TU boundaries
-+     boundary_left = x0 > 0 && !(x0 & 7);
-+     if (boundary_left &&
-+         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
-+@@ -856,64 +822,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
-+           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
-+         boundary_left = 0;
-+ 
-++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
-++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
-++
-+     if (boundary_left) {
-+         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
-+                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
-+-                               s->ref->refPicList;
-+-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
-+-        int xq_pu =  x0      >> log2_min_pu_size;
-+-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
-+-        int xq_tu =  x0      >> log2_min_tu_size;
-+-
-+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int y_pu      = (y0 + i) >> log2_min_pu_size;
-+-                int y_tu      = (y0 + i) >> log2_min_tu_size;
-+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-+-                uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
-+-                uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
-+-
-+-                if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
-+-                    bs = 2;
-+-                else if (curr_cbf_luma || left_cbf_luma)
-+-                    bs = 1;
-+-                else
-+-                    bs = boundary_strength(s, curr, left, rpl_left);
-+-                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
-+-            }
-+-    }
-++                               rpl;
-++        MvField *left = curr - 1;
-+ 
-+-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
-+-        RefPicList *rpl = s->ref->refPicList;
-++        if (is_intra) {
-++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
-++                bs[j * s->bs_width >> 2] = 2;
-+ 
-+-        // bs for TU internal horizontal PU boundaries
-+-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
-+-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
-+-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
-+-
-+-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
-+-                int x_pu = (x0 + i) >> log2_min_pu_size;
-+-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
-+-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
-+-
-+-                bs = boundary_strength(s, curr, top, rpl);
-+-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-++        } else {
-++            int y_tu = y0 >> log2_min_tu_size;
-++            int x_tu = x0 >> log2_min_tu_size;
-++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
-++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
-++
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-++                    rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
-++                    curr, left, bs);
-++
-++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-++                int j_pu = j >> log2_min_pu_size;
-++                int j_tu = j >> log2_min_tu_size;
-++
-++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
-++                    bs[j * s->bs_width >> 2] = 2;
-++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
-++                    bs[j * s->bs_width >> 2] = 1;
-+             }
-+         }
-++    }
-+ 
-+-        // bs for TU internal vertical PU boundaries
-+-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
-+-            int y_pu = (y0 + j) >> log2_min_pu_size;
-++    if (!is_intra) {
-++        for (i = inc; i < trafo_in_min_pus; i += inc) {
-++            MvField *left;
-+ 
-+-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
-+-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
-+-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
-+-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
-+-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
-++            curr += inc;
-++            left = curr - 1;
-++            bs += inc << log2_min_pu_size >> 2;
-+ 
-+-                bs = boundary_strength(s, curr, left, rpl);
-+-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
-+-            }
-++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
-++                    min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
-++                    rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
-++                    curr, left, bs);
-+         }
-+     }
-+ }
-+diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
-+index 9d773d9..a6534a9 100644
-+--- a/libavcodec/hevcdsp.c
-++++ b/libavcodec/hevcdsp.c
-+@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
-+ #include "hevcdsp_template.c"
-+ #undef BIT_DEPTH
-+ 
-++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
-++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                               MvField *curr, MvField *neigh, uint8_t *bs)
-++{
-++    for (; pus > 0; pus--) {
-++        int strength, out;
-++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
-++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
-++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
-++
-++#if 1 // This more directly matches the original implementation
-++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
-++            // same L0 and L1
-++            if (curr_refL0 == neigh_refL0 &&
-++                curr_refL0 == curr_refL1 &&
-++                neigh_refL0 == neigh_refL1) {
-++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else if (neigh_refL0 == curr_refL0 &&
-++                       neigh_refL1 == curr_refL1) {
-++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else if (neigh_refL1 == curr_refL0 &&
-++                       neigh_refL0 == curr_refL1) {
-++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else {
-++                strength = 1;
-++            }
-++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-++            Mv curr_mv0, neigh_mv0;
-++
-++            if (curr->pred_flag & 1) {
-++                curr_mv0   = curr->mv[0];
-++            } else {
-++                curr_mv0   = curr->mv[1];
-++                curr_refL0 = curr_refL1;
-++            }
-++
-++            if (neigh->pred_flag & 1) {
-++                neigh_mv0   = neigh->mv[0];
-++            } else {
-++                neigh_mv0   = neigh->mv[1];
-++                neigh_refL0 = neigh_refL1;
-++            }
-++
-++            if (curr_refL0 == neigh_refL0) {
-++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
-++                    strength = 1;
-++                else
-++                    strength = 0;
-++            } else
-++                strength = 1;
-++        } else
-++            strength = 1;
-++#else // This has exactly the same effect, but is more suitable for vectorisation
-++        Mv curr_mv[2];
-++        Mv neigh_mv[2];
-++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
-++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
-++
-++        if (!(curr->pred_flag & 2)) {
-++            curr_mv[1] = curr_mv[0];
-++            curr_refL1 = curr_refL0;
-++        }
-++        if (!(neigh->pred_flag & 2)) {
-++            neigh_mv[1] = neigh_mv[0];
-++            neigh_refL1 = neigh_refL0;
-++        }
-++        if (!(curr->pred_flag & 1)) {
-++            curr_mv[0] = curr_mv[1];
-++            curr_refL0 = curr_refL1;
-++        }
-++        if (!(neigh->pred_flag & 1)) {
-++            neigh_mv[0] = neigh_mv[1];
-++            neigh_refL0 = neigh_refL1;
-++        }
-++
-++        strength = 1;
-++
-++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-++                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
-++                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
-++
-++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-++                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
-++                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
-++
-++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
-++#endif
-++
-++        curr += in_inc / sizeof (MvField);
-++        neigh += in_inc / sizeof (MvField);
-++
-++        for (out = dup; out > 0; out--)
-++        {
-++            *bs = strength;
-++            bs += out_inc;
-++        }
-++    }
-++}
-++
-+ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
-+ {
-+ #undef FUNC
-+@@ -257,6 +371,8 @@ int i = 0;
-+         break;
-+     }
-+ 
-++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-++
-+     if (ARCH_X86)
-+         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
-+     if (ARCH_ARM)
-+diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
-+index 9f1f6dd..e221e54 100644
-+--- a/libavcodec/hevcdsp.h
-++++ b/libavcodec/hevcdsp.h
-+@@ -42,6 +42,17 @@ typedef struct SAOParams {
-+     uint8_t type_idx[3];    ///< sao_type_idx
-+ } SAOParams;
-+ 
-++typedef struct Mv {
-++    int16_t x;  ///< horizontal component of motion vector
-++    int16_t y;  ///< vertical component of motion vector
-++} Mv;
-++
-++typedef struct MvField {
-++    DECLARE_ALIGNED(4, Mv, mv)[2];
-++    int8_t ref_idx[2];
-++    int8_t pred_flag;
-++} MvField;
-++
-+ typedef struct HEVCDSPContext {
-+     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+                     struct GetBitContext *gb, int pcm_bit_depth);
-+@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
-+     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+                                         int32_t *tc, uint8_t *no_p,
-+                                         uint8_t *no_q);
-++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
-++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
-++                                               MvField *curr, MvField *neigh, uint8_t *bs);
-+ } HEVCDSPContext;
-+ 
-+ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-+-- 
-+2.5.0
-+
-+
-+From 95c6d1107c1dc60fd40abeb9eadb69b3937ce9f5 Mon Sep 17 00:00:00 2001
-+From: Peter de Rivaz <peter.derivaz@gmail.com>
-+Date: Wed, 15 Jul 2015 09:09:11 +0100
-+Subject: [PATCH 68/68] Only enable qpu when needed
-+
-+---
-+ libavcodec/hevc.h    |  2 +-
-+ libavcodec/rpi_qpu.c | 21 ++++++++++++++++-----
-+ 2 files changed, 17 insertions(+), 6 deletions(-)
-+
-+diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
-+index 10fbccc..a8ff7b8 100644
-+--- a/libavcodec/hevc.h
-++++ b/libavcodec/hevc.h
-+@@ -56,7 +56,7 @@
-+   // Define RPI_WORKER to launch a worker thread for pixel processing tasks
-+   #define RPI_WORKER
-+   // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
-+-  #define RPI_DEBLOCK_VPU
-++  //#define RPI_DEBLOCK_VPU
-+ 
-+ #endif
-+ 
-+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-+index 5aa0432..ffd13ca 100644
-+--- a/libavcodec/rpi_qpu.c
-++++ b/libavcodec/rpi_qpu.c
-+@@ -9,7 +9,7 @@
-+ // define RPI_ASYNC to run the VPU in a separate thread, need to make a separate call to check for completion
-+ #define RPI_ASYNC
-+ // Define RPI_COMBINE_JOBS to find jobs that can be executed in parallel
-+-#define RPI_COMBINE_JOBS
-++//#define RPI_COMBINE_JOBS
-+ 
-+ #include <stdio.h>
-+ #include <stdlib.h>
-+@@ -143,9 +143,9 @@ static int gpu_init(volatile struct GPU **gpu) {
-+   volatile struct GPU* ptr;
-+ 	if (mb < 0)
-+ 		return -1;
-+-
-++#ifndef RPI_ASYNC
-+ 	if (qpu_enable(mb, 1)) return -2;
-+-
-++#endif
-+   vcsm_init();
-+   gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+   ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+@@ -336,9 +336,9 @@ static void gpu_term(void)
-+     vpu_post_code(0, 0, 0, 0, 0, 0, -1, NULL);
-+     pthread_join(vpu_thread, &res);
-+   }
-+-#endif
-+-
-++#else
-+   qpu_enable(mb, 0);
-++#endif
-+   gpu_free_internal(&gpu_mem_ptr);
-+ 
-+   vcsm_exit();
-+@@ -400,6 +400,7 @@ static void *vpu_start(void *arg) {
-+   int count_deblock=0;
-+   int count_qpu=0;
-+ #endif
-++  int qpu_started = 0;
-+   while(1) {
-+     int i;
-+     int *p; // Pointer for a QPU/VPU job
-+@@ -427,6 +428,12 @@ static void *vpu_start(void *arg) {
-+     if (p[7] == 0 && p[0] == 0 && p[16]==0)
-+       goto job_done_early;
-+ 
-++    if (!qpu_started) {
-++      int result = qpu_enable(gpu->mb, 1);
-++      av_assert0(result==0);
-++      qpu_started = 1;
-++    }
-++
-+ #ifdef RPI_COMBINE_JOBS
-+     // First scan for a qpu job
-+     for (int x=0;x<num_jobs;x++) {
-+@@ -556,6 +563,10 @@ job_done_early:
-+     pthread_mutex_unlock(&post_mutex);
-+   }
-+ 
-++  if (qpu_started) {
-++    qpu_enable(gpu->mb, 0);
-++  }
-++
-+   return NULL;
-+ }
-+ 
-+-- 
-+2.5.0
-+
-
-From aa8268363b74f1b9ed6d6801d379bc08a85eead2 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 14 Dec 2015 12:35:14 +0000
-Subject: [PATCH 63/93] [build] Add patches to ffmpeg for native build
-
----
- tools/depends/target/ffmpeg/autobuild.sh | 8 ++++++++
- 1 file changed, 8 insertions(+)
-
-diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
-index b9bfd57..f6d4c3b 100755
---- a/tools/depends/target/ffmpeg/autobuild.sh
-+++ b/tools/depends/target/ffmpeg/autobuild.sh
-@@ -125,6 +125,14 @@ mkdir ffmpeg-${VERSION}
- cd ffmpeg-${VERSION} || exit 2
- tar --strip-components=1 -xf ../${ARCHIVE}
- 
-+patch -p1 < ../ffmpeg_Speed_up_wtv_index_creation.patch
-+patch -p1 < ../0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-+patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
-+patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
-+patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
-+patch -p1 < ../pfcd_hevc_optimisations.patch
-+patch -p1 < ../add_h264_MVC_support.patch
-+
- CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
- ./configure --prefix=$FFMPEG_PREFIX \
- 	--extra-version="kodi-${VERSION}" \
-
-From 31e2cf35741edf914f5413668c158186f9310197 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 64/93] ffmpeg: Add cabac opimisations for hevc
-
----
- .../0001-Squashed-commit-of-the-following.patch    | 2288 ++++++++++++++++++++
- tools/depends/target/ffmpeg/Makefile               |    5 +-
- 2 files changed, 2292 insertions(+), 1 deletion(-)
- create mode 100644 tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-
-diff --git a/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch b/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-new file mode 100644
-index 0000000..adb584b
---- /dev/null
-+++ b/tools/depends/target/ffmpeg/0001-Squashed-commit-of-the-following.patch
-@@ -0,0 +1,2288 @@
-+From 9421229d7f8e6ef6cfb8a1b30f731f91c6586aca Mon Sep 17 00:00:00 2001
-+From: John Cox <jc@kynesim.co.uk>
-+Date: Wed, 13 Jan 2016 16:13:33 +0000
-+Subject: [PATCH] H.265 residual decode rework (v2)
-+
-+Rework the cabac decode functions
-+Simplify the code flow and variable usage where possible
-+
-+(Remove profiling and other spurious deltas that were in v1)
-+---
-+ libavcodec/arm/cabac.h                |  155 ++++-
-+ libavcodec/arm/hevc_cabac.h           |  491 +++++++++++++++
-+ libavcodec/arm/hevcdsp_deblock_neon.S |   13 +-
-+ libavcodec/arm/hevcdsp_epel_neon.S    |    9 +-
-+ libavcodec/cabac.c                    |   11 +-
-+ libavcodec/cabac.h                    |    9 +-
-+ libavcodec/cabac_functions.h          |   15 +-
-+ libavcodec/hevc_cabac.c               | 1098 +++++++++++++++++++++++++--------
-+ 8 files changed, 1534 insertions(+), 267 deletions(-)
-+ create mode 100644 libavcodec/arm/hevc_cabac.h
-+
-+diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-+index fdbf86b..0a3980a 100644
-+--- a/libavcodec/arm/cabac.h
-++++ b/libavcodec/arm/cabac.h
-+@@ -26,13 +26,34 @@
-+ #include "libavutil/internal.h"
-+ #include "libavcodec/cabac.h"
-+ 
-++
-++#if UNCHECKED_BITSTREAM_READER
-++#define LOAD_16BITS_BEHI\
-++        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#elif CONFIG_THUMB
-++#define LOAD_16BITS_BEHI\
-++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-++        "it         cs                                          \n\t"\
-++        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#else
-++#define LOAD_16BITS_BEHI\
-++        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
-++        "cmp        %[tmp]        , %[ptr]                      \n\t"\
-++        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
-++        "rev        %[tmp]        , %[tmp]                      \n\t"
-++#endif
-++
-++
-+ #define get_cabac_inline get_cabac_inline_arm
-+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-+                                                  uint8_t *const state)
-+ {
-+     int bit;
-++#if 0
-+     void *reg_b, *reg_c, *tmp;
-+-
-+     __asm__ volatile(
-+         "ldrb       %[bit]        , [%[state]]                  \n\t"
-+         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
-+@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-+           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
-+         : "memory", "cc"
-+         );
-++#else
-++   // *** Not thumb compatible yet
-++   unsigned int reg_b, tmp;
-++    __asm__ (
-++        "ldrb       %[bit]        , [%[state]]                  \n\t"
-++        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-++        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
-++// %bit = *state
-++// %range = range
-++// %tmp = RangeLPS
-++        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "ittt       ge                                          \n\t"
-++        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "mvnge      %[bit]        , %[bit]                      \n\t"
-++        "movge      %[range]      , %[tmp]                      \n\t"
-++
-++        "clz        %[tmp]        , %[range]                    \n\t"
-++        "sub        %[tmp]        , #23                         \n\t"
-++
-++        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++        "strb       %[r_b]        , [%[state]]                  \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++
-++        "bne        2f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
-++        "movw       %[r_b]        , #0xFFFF                     \n\t"
-++        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++
-++        "rbit       %[r_b]        , %[low]                      \n\t"
-++        "clz        %[r_b]        , %[r_b]                      \n\t"
-++        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++#if CONFIG_THUMB
-++        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++        "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++        "2:                                                     \n\t"
-++        :    [bit]"=&r"(bit),
-++             [low]"+&r"(c->low),
-++           [range]"+&r"(c->range),
-++             [r_b]"=&r"(reg_b),
-++             [ptr]"+&r"(c->bytestream),
-++             [tmp]"=&r"(tmp)
-++          :  [state]"r"(state),
-++            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++              [byte]"M"(offsetof(CABACContext, bytestream)),
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++        : "memory", "cc"
-++        );
-++#endif
-+ 
-+     return bit & 1;
-+ }
-++
-++#define get_cabac_bypass get_cabac_bypass_arm
-++static inline int get_cabac_bypass_arm(CABACContext * const c)
-++{
-++    int rv = 0;
-++    unsigned int tmp;
-++    __asm (
-++        "lsl        %[low]        , #1                          \n\t"
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "adc        %[rv]         , %[rv]       , #0            \n\t"
-++        "it         cs                                          \n\t"
-++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++        "bne        1f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-++        "movw       %[tmp]        , #0xFFFF                     \n\t"
-++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "1:                                                     \n\t"
-++        : // Outputs
-++              [rv]"+&r"(rv),
-++             [low]"+&r"(c->low),
-++             [tmp]"=&r"(tmp),
-++             [ptr]"+&r"(c->bytestream)
-++        : // Inputs
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++             [range]"r"(c->range)
-++        : "cc"
-++    );
-++    return rv;
-++}
-++
-++
-++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
-++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
-++{
-++    unsigned int tmp;
-++    __asm (
-++        "lsl        %[low]        , #1                          \n\t"
-++        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-++        "ite        cc                                          \n\t"
-++        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
-++        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++        "bne        1f                                          \n\t"
-++        LOAD_16BITS_BEHI
-++        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
-++        "movw       %[tmp]        , #0xFFFF                     \n\t"
-++        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
-++        "1:                                                     \n\t"
-++        : // Outputs
-++              [rv]"+&r"(rv),
-++             [low]"+&r"(c->low),
-++             [tmp]"=&r"(tmp),
-++             [ptr]"+&r"(c->bytestream)
-++        : // Inputs
-++#if !UNCHECKED_BITSTREAM_READER
-++                 [c]"r"(c),
-++               [end]"M"(offsetof(CABACContext, bytestream_end)),
-++#endif
-++             [range]"r"(c->range)
-++        : "cc"
-++    );
-++    return rv;
-++}
-++
-+ #endif /* HAVE_ARMV6T2_INLINE */
-+ 
-+ #endif /* AVCODEC_ARM_CABAC_H */
-+diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
-+new file mode 100644
-+index 0000000..31d3c59
-+--- /dev/null
-++++ b/libavcodec/arm/hevc_cabac.h
-+@@ -0,0 +1,491 @@
-++/*
-++ * This file is part of FFmpeg.
-++ *
-++ * FFmpeg is free software; you can redistribute it and/or
-++ * modify it under the terms of the GNU Lesser General Public
-++ * License as published by the Free Software Foundation; either
-++ * version 2.1 of the License, or (at your option) any later version.
-++ *
-++ * FFmpeg is distributed in the hope that it will be useful,
-++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-++ * Lesser General Public License for more details.
-++ *
-++ * You should have received a copy of the GNU Lesser General Public
-++ * License along with FFmpeg; if not, write to the Free Software
-++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-++ */
-++
-++#ifndef AVCODEC_ARM_HEVC_CABAC_H
-++#define AVCODEC_ARM_HEVC_CABAC_H
-++
-++#include "config.h"
-++#if HAVE_ARMV6T2_INLINE
-++
-++#define hevc_mem_bits32 hevc_mem_bits32_arm
-++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
-++{
-++    unsigned int n;
-++    __asm__ (
-++        "rev        %[n], %[x]                     \n\t"
-++        : [n]"=r"(n)
-++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
-++        :
-++        );
-++    return n << (bits & 7);
-++}
-++
-++
-++// ---------------------------------------------------------------------------
-++//
-++// Helper fns - little bits of code where ARM has an instraction that the
-++// compiler doesn't know about / use
-++
-++#define trans_scale_sat trans_scale_sat_arm
-++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-++{
-++    int rv;
-++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
-++
-++    __asm__ (
-++    "ssat %[rv], #16, %[t], ASR #1 \n\t"
-++    : [rv]"=r"(rv)
-++    : [t]"r"(t)
-++    :
-++    );
-++    return rv;
-++}
-++
-++#define update_rice update_rice_arm
-++static inline void update_rice_arm(uint8_t * const stat_coeff,
-++    const unsigned int last_coeff_abs_level_remaining,
-++    const unsigned int c_rice_param)
-++{
-++    int t;
-++    __asm__ (
-++    "lsl   %[t], %[coeff], #1               \n\t"
-++    "lsrs  %[t], %[t], %[shift]             \n\t"
-++    "it    eq                               \n\t"
-++    "subeq %[stat], %[stat], #1             \n\t"
-++    "cmp   %[t], #6                         \n\t"
-++    "adc   %[stat], %[stat], #0             \n\t"
-++    "usat  %[stat], #8, %[stat]             \n\t"
-++    : [stat]"+&r"(*stat_coeff),
-++         [t]"=&r"(t)
-++    :  [coeff]"r"(last_coeff_abs_level_remaining),
-++       [shift]"r"(c_rice_param)
-++    : "cc"
-++    );
-++}
-++
-++// ---------------------------------------------------------------------------
-++//
-++// CABAC get loops
-++//
-++// Where the loop is simple enough we can normally do 10-30% better than the
-++// compiler
-++
-++// Get the residual greater than 1 bits
-++
-++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
-++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
-++    uint8_t * const state0)
-++{
-++    unsigned int i, reg_b, st, tmp, bit, rv;
-++     __asm__ (
-++         "mov        %[i]          , #0                          \n\t"
-++         "mov        %[rv]         , #0                          \n\t"
-++         "1:                                                     \n\t"
-++         "add        %[i]          , %[i]        , #1            \n\t"
-++         "cmp        %[rv]         , #0                          \n\t"
-++         "ite        eq                                          \n\t"
-++         "usateq     %[st]         , #2          , %[i]          \n\t"
-++         "movne      %[st]         , #0                          \n\t"
-++
-++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
-++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "cmp        %[low]        , %[range], lsl #17           \n\t"
-++         "ittt       ge                                          \n\t"
-++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++         "mvnge      %[bit]        , %[bit]                      \n\t"
-++         "movge      %[range]      , %[tmp]                      \n\t"
-++
-++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++         "and        %[bit]        , %[bit]      , #1            \n\t"
-++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
-++
-++         "clz        %[tmp]        , %[range]                    \n\t"
-++         "sub        %[tmp]        , #23                         \n\t"
-++
-++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-++// There is a small speed gain from combining both conditions, using a single
-++// branch and then working out what that meant later
-++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++         "it         ne                                          \n\t"
-++         "cmpne      %[n]          , %[i]                        \n\t"
-++         "bne        1b                                          \n\t"
-++
-++// If reload is not required then we must have run out of flags to decode
-++         "tst        %[tmp]        , %[tmp]                      \n\t"
-++         "bne        2f                                          \n\t"
-++
-++// Do reload
-++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-++         "movw       %[r_b]        , #0xFFFF                     \n\t"
-++         "rev        %[tmp]        , %[tmp]                      \n\t"
-++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-++
-++         "rbit       %[r_b]        , %[low]                      \n\t"
-++         "clz        %[r_b]        , %[r_b]                      \n\t"
-++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++
-++#if CONFIG_THUMB
-++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++
-++         "cmp        %[n]          , %[i]                        \n\t"
-++         "bne        1b                                          \n\t"
-++         "2:                                                     \n\t"
-++         :    [bit]"=&r"(bit),
-++              [low]"+&r"(c->low),
-++            [range]"+&r"(c->range),
-++              [r_b]"=&r"(reg_b),
-++             [bptr]"+&r"(c->bytestream),
-++                [i]"=&r"(i),
-++              [tmp]"=&r"(tmp),
-++               [st]"=&r"(st),
-++               [rv]"=&r"(rv)
-++          :  [state0]"r"(state0),
-++                  [n]"r"(n),
-++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++               [byte]"M"(offsetof(CABACContext, bytestream)),
-++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++         : "memory", "cc"
-++    );
-++    return rv;
-++}
-++
-++
-++// n must be > 0 on entry
-++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * p)
-++{
-++    unsigned int reg_b, tmp, st, bit;
-++     __asm__ (
-++         "1:                                                     \n\t"
-++// Get bin from map
-++         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
-++
-++// Load state & ranges
-++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
-++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
-++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
-++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "cmp        %[low]        , %[range], lsl #17           \n\t"
-++         "ittt       ge                                          \n\t"
-++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-++         "mvnge      %[bit]        , %[bit]                      \n\t"
-++         "movge      %[range]      , %[tmp]                      \n\t"
-++
-++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-++         "tst        %[bit]        , #1                          \n\t"
-++// GCC asm seems to need strbne written differently for thumb and arm
-++#if CONFIG_THUMB
-++         "it         ne                                          \n\t"
-++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
-++#else
-++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
-++#endif
-++
-++// Renorm
-++         "clz        %[tmp]        , %[range]                    \n\t"
-++         "sub        %[tmp]        , #23                         \n\t"
-++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-++
-++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
-++// There is a small speed gain from combining both conditions, using a single
-++// branch and then working out what that meant later
-++         "subs       %[n]          , %[n]        , #1            \n\t"
-++#if CONFIG_THUMB
-++         "itt        ne                                          \n\t"
-++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        1b                                          \n\t"
-++#else
-++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        1b                                          \n\t"
-++#endif
-++
-++// If we have bits left then n must be 0 so give up now
-++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
-++         "bne        2f                                          \n\t"
-++
-++// Do reload
-++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
-++         "movw       %[r_b]        , #0xFFFF                     \n\t"
-++         "rev        %[tmp]        , %[tmp]                      \n\t"
-++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
-++
-++         "rbit       %[r_b]        , %[low]                      \n\t"
-++         "clz        %[r_b]        , %[r_b]                      \n\t"
-++         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-++
-++#if CONFIG_THUMB
-++         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
-++#else
-++         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-++#endif
-++
-++// Check to see if we still have more to do
-++         "cmp        %[n]          , #0                          \n\t"
-++         "bne        1b                                          \n\t"
-++         "2:                                                     \n\t"
-++         :    [bit]"=&r"(bit),
-++              [low]"+&r"(c->low),
-++            [range]"+&r"(c->range),
-++              [r_b]"=&r"(reg_b),
-++             [bptr]"+&r"(c->bytestream),
-++              [idx]"+&r"(p),
-++                [n]"+&r"(n),
-++              [tmp]"=&r"(tmp),
-++               [st]"=&r"(st)
-++          :  [state0]"r"(state0),
-++            [ctx_map]"r"(ctx_map),
-++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-++               [byte]"M"(offsetof(CABACContext, bytestream)),
-++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-++         : "memory", "cc"
-++    );
-++
-++    return p;
-++}
-++
-++// ---------------------------------------------------------------------------
-++//
-++// CABAC_BY22 functions
-++//
-++// By and large these are (at best) no faster than their C equivalents - the
-++// only one worth having is _peek where we do a slightly better job than the
-++// compiler
-++//
-++// The others have been stashed here for reference in case larger scale asm
-++// is attempted in which case they might be a useful base
-++
-++
-++#define get_cabac_by22_peek get_cabac_by22_peek_arm
-++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
-++{
-++    uint32_t rv, tmp;
-++    __asm__ (
-++        "bic      %[rv]  , %[low], #1            \n\t"
-++        "cmp      %[inv] , #0                    \n\t"
-++        "it       ne                             \n\t"
-++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
-++        :  // Outputs
-++             [rv]"=&r"(rv),
-++             [tmp]"=r"(tmp)
-++        :  // Inputs
-++             [low]"r"(c->low),
-++             [inv]"r"(c->range)
-++        :  // Clobbers
-++                "cc"
-++    );
-++    return rv << 1;
-++}
-++
-++#if 0
-++
-++// ***** Slower than the C  :-(
-++#define get_cabac_by22_flush get_cabac_by22_flush_arm
-++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
-++{
-++    uint32_t m, tmp;
-++    __asm__ (
-++    "add    %[bits], %[bits], %[n]   \n\t"
-++    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
-++
-++    "rsb    %[tmp], %[n], #32        \n\t"
-++    "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++    "mul    %[tmp], %[range], %[tmp] \n\t"
-++
-++    "rev    %[m], %[m]               \n\t"
-++
-++    "lsl    %[tmp], %[tmp], #23      \n\t"
-++    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++    "and    %[tmp], %[bits], #7         \n\t"
-++    "lsl    %[m], %[m], %[tmp]          \n\t"
-++
-++    "orr    %[low], %[low], %[m], lsr #9      \n\t"
-++        :  // Outputs
-++             [m]"=&r"(m),
-++           [tmp]"=&r"(tmp),
-++          [bits]"+&r"(c->by22.bits),
-++           [low]"+&r"(c->low)
-++        :  // Inputs
-++               [n]"r"(n),
-++             [val]"r"(val),
-++             [inv]"r"(c->range),
-++           [range]"r"(c->by22.range),
-++             [ptr]"r"(c->bytestream)
-++        :  // Clobbers
-++    );
-++}
-++
-++
-++// Works but slower than C
-++#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
-++static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
-++{
-++    uint32_t n, val, tmp, level;
-++
-++//    PROFILE_START();
-++
-++    __asm__ (
-++            // Peek
-++            "bic    %[val],  %[low],   #1  \n\t"
-++            "cmp    %[inv], #0          \n\t"
-++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-++            "lsl    %[val], %[val], #1  \n\t"
-++
-++            // Count bits (n = prefix)
-++            "mvn    %[n], %[val] \n\t"
-++            "clz    %[n], %[n]   \n\t"
-++
-++            "lsl    %[level], %[val], %[n] \n\t"
-++            "subs   %[tmp], %[n], #3 \n\t"
-++            "blo    2f \n\t"
-++
-++            // prefix >= 3
-++            // < tmp = prefix - 3
-++            // > tmp = prefix + rice - 3
-++            "add    %[tmp], %[tmp], %[rice] \n\t"
-++            // > n = prefix * 2 + rice - 3
-++            "add    %[n], %[tmp], %[n] \n\t"
-++            "cmp    %[n], #21 \n\t"
-++            "bhi    3f \n\t"
-++
-++            "orr    %[level], %[level], #0x80000000 \n\t"
-++            "rsb    %[tmp], %[tmp], #31 \n\t"
-++            "lsr    %[level], %[level], %[tmp] \n\t"
-++
-++            "mov    %[tmp], #2 \n\t"
-++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-++            "b      1f \n\t"
-++
-++            // > 22 bits used in total - need reload
-++            "3:  \n\t"
-++
-++            // Stash prefix + rice - 3 in level (only spare reg)
-++            "mov    %[level], %[tmp] \n\t"
-++            // Restore n to flush value (prefix)
-++            "sub    %[n], %[n], %[tmp] \n\t"
-++
-++            // Flush + reload
-++
-++//          "rsb    %[tmp], %[n], #32        \n\t"
-++//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++//          "mul    %[tmp], %[range], %[tmp] \n\t"
-++
-++            // As it happens we know that all the bits we are flushing are 1
-++            // so we can cheat slightly
-++            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
-++            "lsl    %[tmp], %[tmp], #23      \n\t"
-++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++            "add    %[bits], %[bits], %[n]   \n\t"
-++            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
-++            "rev    %[n], %[n]               \n\t"
-++            "and    %[tmp], %[bits], #7         \n\t"
-++            "lsl    %[n], %[n], %[tmp]          \n\t"
-++
-++            "orr    %[low], %[low], %[n], lsr #9      \n\t"
-++
-++            // (reload)
-++
-++            "bic    %[val],  %[low],   #1  \n\t"
-++            "cmp    %[inv], #0          \n\t"
-++            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-++            "lsl    %[val], %[val], #1  \n\t"
-++
-++            // Build value
-++
-++            "mov    %[n], %[level] \n\t"
-++
-++            "orr     %[tmp], %[val], #0x80000000 \n\t"
-++            "rsb     %[level], %[level], #31 \n\t"
-++            "lsr     %[level], %[tmp], %[level] \n\t"
-++
-++            "mov    %[tmp], #2 \n\t"
-++            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-++            "b      1f \n\t"
-++
-++            // prefix < 3
-++            "2:  \n\t"
-++            "rsb    %[tmp], %[rice], #31 \n\t"
-++            "lsr    %[level], %[level], %[tmp] \n\t"
-++            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
-++            "add    %[n], %[n], %[rice] \n\t"
-++
-++            "1:  \n\t"
-++            // Flush
-++            "add    %[n], %[n], #1 \n\t"
-++
-++            "rsb    %[tmp], %[n], #32        \n\t"
-++            "lsr    %[tmp], %[val], %[tmp]   \n\t"
-++
-++            "add    %[bits], %[bits], %[n]   \n\t"
-++            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
-++
-++            "mul    %[tmp], %[range], %[tmp] \n\t"
-++            "lsl    %[tmp], %[tmp], #23      \n\t"
-++            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-++
-++            "rev    %[val], %[val]               \n\t"
-++            "and    %[tmp], %[bits], #7         \n\t"
-++            "lsl    %[val], %[val], %[tmp]          \n\t"
-++
-++            "orr    %[low], %[low], %[val], lsr #9      \n\t"
-++        :  // Outputs
-++         [level]"=&r"(level),
-++             [n]"=&r"(n),
-++           [val]"=&r"(val),
-++           [tmp]"=&r"(tmp),
-++          [bits]"+&r"(c->by22.bits),
-++           [low]"+&r"(c->low)
-++        :  // Inputs
-++            [rice]"r"(c_rice_param),
-++             [inv]"r"(c->range),
-++           [range]"r"(c->by22.range),
-++             [ptr]"r"(c->bytestream)
-++        :  // Clobbers
-++                "cc"
-++    );
-++
-++//    PROFILE_ACC(residual_abs);
-++
-++    return level;
-++}
-++#endif
-++
-++#endif /* HAVE_ARMV6T2_INLINE */
-++
-++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
-+diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-+index bad4589..a088cc3 100644
-+--- a/libavcodec/arm/hevcdsp_deblock_neon.S
-++++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-+@@ -409,10 +409,12 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         beq         90f
-+ 
-+         tst         a3, #1
-++        itee        ne
-+         ldrne       a3, [v5, #0]    @ curr->mv[0]
-+         ldreq       a3, [v5, #4]    @ curr->mv[1]
-+         moveq       v1, v2
-+         tst         v8, #1
-++        itee        ne
-+         ldrne       v8, [v6, #0]    @ neigh->mv[0]
-+         ldreq       v8, [v6, #4]    @ neigh->mv[1]
-+         moveq       v3, v4
-+@@ -424,9 +426,14 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         sel         a3, a3, ip
-+         ands        a3, a3, lr
-+         @ drop through
-+-10:     movne       a3, #1
-++10:     it          ne
-++        movne       a3, #1
-+ 11:     subs        a2, a2, #1
-+-12:     strbhs      a3, [v7], a4
-++12:
-++A       strbhs      a3, [v7], a4
-++T       itt         hs
-++T       strbhs      a3, [v7]
-++T       addhs       v7, v7, a4
-+         subs        a2, a2, #1
-+         bhs         12b
-+ 
-+@@ -442,6 +449,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         bne         10b
-+ 
-+         teq         v1, v3
-++        it          eq
-+         teqeq       v2, v4
-+         bne         40f
-+         teq         v1, v2
-+@@ -487,6 +495,7 @@ function ff_hevc_deblocking_boundary_strengths_neon, export=1
-+         b           10b
-+ 
-+ 40:     teq         v1, v4
-++        ite         eq
-+         teqeq       v2, v3
-+         bne         10b
-+ 
-+diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
-+index 516ae5b..00eab9e 100644
-+--- a/libavcodec/arm/hevcdsp_epel_neon.S
-++++ b/libavcodec/arm/hevcdsp_epel_neon.S
-+@@ -110,7 +110,9 @@ function ff_hevc_put_epel_h_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++@ adr reaches if we are in thumb mode but not in arm
-++T       adr    r12, epel_coeffs
-++A       adrl   r12, epel_coeffs
-+         add    r7, r12
-+         sub       r1, #1
-+         lsl       r4, #1
-+@@ -170,7 +172,8 @@ function ff_hevc_put_epel_v_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++T       adr    r12, epel_coeffs
-++A       adrl   r12, epel_coeffs
-+         add    r7, r12
-+         load_coeffs_16b r7
-+         sub       r1, r2
-+@@ -246,7 +249,7 @@ function ff_hevc_put_epel_hv_neon_8, export=1
-+         sub    r7, #1
-+         lsl    r7, #2
-+         vpush {d8-d15}
-+-        adrl   r12, epel_coeffs
-++        adr    r12, epel_coeffs
-+         sub    r6, #1
-+         lsl    r6, #2
-+         add    r6, r12 // mx epel coeff offset
-+diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
-+index f298336..91f5ef5 100644
-+--- a/libavcodec/cabac.c
-++++ b/libavcodec/cabac.c
-+@@ -59,10 +59,19 @@ int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
-+ #if CABAC_BITS == 16
-+     c->low =  (*c->bytestream++)<<18;
-+     c->low+=  (*c->bytestream++)<<10;
-++    // Keep our fetches on a 2-byte boundry as this should avoid ever having to
-++    // do unaligned loads if the compiler (or asm) optimises the double byte
-++    // load into a single instruction
-++    if(((uintptr_t)c->bytestream & 1) == 0) {
-++        c->low += (1 << 9);
-++    }
-++    else {
-++        c->low += ((*c->bytestream++) << 2) + 2;
-++    }
-+ #else
-+     c->low =  (*c->bytestream++)<<10;
-+-#endif
-+     c->low+= ((*c->bytestream++)<<2) + 2;
-++#endif
-+     c->range= 0x1FE;
-+     if ((c->range<<(CABAC_BITS+1)) < c->low)
-+         return AVERROR_INVALIDDATA;
-+diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
-+index 857211c..857a1de 100644
-+--- a/libavcodec/cabac.h
-++++ b/libavcodec/cabac.h
-+@@ -48,7 +48,14 @@ extern CABAC_TABLE_CONST uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
-+ typedef struct CABACContext{
-+     int low;
-+     int range;
-+-    int outstanding_count;
-++    union
-++    {
-++        int outstanding_count;
-++        struct {
-++            uint16_t bits;
-++            uint16_t range;
-++        } by22;
-++    };
-+     const uint8_t *bytestream_start;
-+     const uint8_t *bytestream;
-+     const uint8_t *bytestream_end;
-+diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
-+index 2d1d2a6..d3518cb 100644
-+--- a/libavcodec/cabac_functions.h
-++++ b/libavcodec/cabac_functions.h
-+@@ -51,6 +51,7 @@ static CABAC_TABLE_CONST uint8_t * const ff_h264_lps_range = ff_h264_cabac_table
-+ static CABAC_TABLE_CONST uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
-+ static CABAC_TABLE_CONST uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
-+ 
-++#if !defined(get_cabac_bypass) || !defined(get_cabac_terminate)
-+ static void refill(CABACContext *c){
-+ #if CABAC_BITS == 16
-+         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
-+@@ -63,7 +64,9 @@ static void refill(CABACContext *c){
-+ #endif
-+         c->bytestream += CABAC_BITS / 8;
-+ }
-++#endif
-+ 
-++#ifndef get_cabac_terminate
-+ static inline void renorm_cabac_decoder_once(CABACContext *c){
-+     int shift= (uint32_t)(c->range - 0x100)>>31;
-+     c->range<<= shift;
-+@@ -71,14 +74,18 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
-+     if(!(c->low & CABAC_MASK))
-+         refill(c);
-+ }
-++#endif
-+ 
-+ #ifndef get_cabac_inline
-+ static void refill2(CABACContext *c){
-+     int i;
-+     unsigned x;
-+-
-++#if !HAVE_FAST_CLZ
-+     x= c->low ^ (c->low-1);
-+     i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
-++#else
-++    i = ff_ctz(c->low) - CABAC_BITS;
-++#endif
-+ 
-+     x= -CABAC_MASK;
-+ 
-+@@ -94,7 +101,9 @@ static void refill2(CABACContext *c){
-+ #endif
-+         c->bytestream += CABAC_BITS/8;
-+ }
-++#endif
-+ 
-++#ifndef get_cabac_inline
-+ static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
-+     int s = *state;
-+     int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
-+@@ -166,6 +175,7 @@ static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
-+  *
-+  * @return the number of bytes read or 0 if no end
-+  */
-++#ifndef get_cabac_terminate
-+ static int av_unused get_cabac_terminate(CABACContext *c){
-+     c->range -= 2;
-+     if(c->low < c->range<<(CABAC_BITS+1)){
-+@@ -175,11 +185,13 @@ static int av_unused get_cabac_terminate(CABACContext *c){
-+         return c->bytestream - c->bytestream_start;
-+     }
-+ }
-++#endif
-+ 
-+ /**
-+  * Skip @p n bytes and reset the decoder.
-+  * @return the address of the first skipped byte or NULL if there's less than @p n bytes left
-+  */
-++#ifndef skip_bytes
-+ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
-+     const uint8_t *ptr = c->bytestream;
-+ 
-+@@ -196,5 +208,6 @@ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
-+ 
-+     return ptr;
-+ }
-++#endif
-+ 
-+ #endif /* AVCODEC_CABAC_FUNCTIONS_H */
-+diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
-+index 271e17a..4caf720 100644
-+--- a/libavcodec/hevc_cabac.c
-++++ b/libavcodec/hevc_cabac.c
-+@@ -21,14 +21,72 @@
-+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+  */
-+ 
-++#define UNCHECKED_BITSTREAM_READER 1
-++
-+ #include "libavutil/attributes.h"
-+ #include "libavutil/common.h"
-+ 
-+-#include "cabac_functions.h"
-+ #include "hevc.h"
-++#include "cabac_functions.h"
-++
-++// BY22 is probably faster than simple bypass if the processor has
-++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-++// x86 has fast int divide
-++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-++// Use native divide if we have a fast one - otherwise use mpy 1/x
-++// x86 has a fast integer divide - arm doesn't - unsure about other
-++// architectures
-++#define USE_BY22_DIV  ARCH_X86
-++
-++// Special case blocks with a single significant ceoff
-++// Decreases the complexity of the code for a common case but increases the
-++// code size.
-++#define USE_N_END_1 1
-++
-++#if ARCH_ARM
-++#include "arm/hevc_cabac.h"
-++#endif
-+ 
-+ #define CABAC_MAX_BIN 31
-+ 
-++
-++#if USE_BY22 && !USE_BY22_DIV
-++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
-++
-++static const uint32_t cabac_by22_inv_range[256] = {
-++                                                    0,      I(257), I(258), I(259),
-++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
-++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
-++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
-++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
-++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
-++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
-++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
-++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
-++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
-++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
-++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
-++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
-++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
-++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
-++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
-++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
-++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
-++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
-++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
-++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
-++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
-++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
-++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
-++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
-++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
-++    I(510), I(511)
-++};
-++#undef I
-++#endif  // USE_BY22
-++
-+ /**
-+  * number of bin by SyntaxElement.
-+  */
-+@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
-+     { 28, 36, 43, 49, 54, 58, 61, 63, },
-+ };
-+ 
-++
-++typedef struct
-++{
-++    uint16_t coeff;
-++    uint16_t scale;
-++} xy_off_t;
-++
-++#define XYT_C(x,y,t) ((x) + ((y) << (t)))
-++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
-++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
-++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
-++
-++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
-++
-++#define OFF_DIAG(t) {\
-++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
-++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
-++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
-++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
-++}
-++
-++#define OFF_HORIZ(t) {\
-++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
-++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
-++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
-++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
-++}
-++
-++#define OFF_VERT(t) {\
-++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
-++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
-++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
-++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
-++}
-++
-++static const xy_off_t off_xys[3][4][16] =
-++{
-++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
-++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
-++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
-++};
-++
-++
-++// Helper fns
-++#ifndef hevc_mem_bits32
-++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
-++{
-++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
-++}
-++#endif
-++
-++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
-++#define hevc_clz32 hevc_clz32_builtin
-++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
-++{
-++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
-++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
-++}
-++#endif
-++
-++// It is unlikely that we will ever need this but include for completeness
-++#ifndef hevc_clz32
-++static inline unsigned int hevc_clz32(unsigned int x)
-++{
-++    unsigned int n = 1;
-++    if ((x & 0xffff0000) == 0) {
-++        n += 16;
-++        x <<= 16;
-++    }
-++    if ((x & 0xff000000) == 0) {
-++        n += 8;
-++        x <<= 8;
-++    }
-++    if ((x & 0xf0000000) == 0) {
-++        n += 4;
-++        x <<= 4;
-++    }
-++    if ((x & 0xc0000000) == 0) {
-++        n += 2;
-++        x <<= 2;
-++    }
-++    return n - ((x >> 31) & 1);
-++}
-++#endif
-++
-++
-++#if !USE_BY22
-++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
-++// will no longer be called but the setup calls will still exist and we want
-++// to null them out
-++#define bypass_start(s)
-++#define bypass_finish(s)
-++#else
-++// Use BY22 for residual bypass block
-++
-++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
-++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
-++
-++// BY22 notes that bypass is simply a divide into the bitstream and so we
-++// can peek out large quantities of bits at one and treat the result as if
-++// it was VLC.  In many cases this will lead to O(1) processing rather than
-++// O(n) though the setup and teardown is sufficiently expensive that it is
-++// only worth using if we expect to be dealing with more than a few bits
-++// The definition of "a few bits" will vary from platform to platform but
-++// tests on ARM show that it probably isn't worth it for a single coded
-++// residual, but is for >1 - this is probaly reinforced that if there are
-++// more residuals then they are likely to be bigger and this will make the
-++// O(1) nature of the code more worthwhile.
-++
-++
-++#if !USE_BY22_DIV
-++// * 1/x @ 32 bits gets us 22 bits of accuracy
-++#define CABAC_BY22_PEEK_BITS  22
-++#else
-++// A real 32-bit divide gets us another bit
-++// If we have a 64 bit int & a unit time divider then we should get a lot
-++// of bits (55)  but that is untested and it is unclear if it would give
-++// us a large advantage
-++#define CABAC_BY22_PEEK_BITS  23
-++#endif
-++
-++// Bypass block start
-++// Must be called before _by22_peek is used as it sets the CABAC environment
-++// into the correct state.  _by22_finish must be called to return to 'normal'
-++// (i.e. non-bypass) cabac decoding
-++static inline void get_cabac_by22_start(CABACContext * const c)
-++{
-++    const unsigned int bits = __builtin_ctz(c->low);
-++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
-++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
-++#if !USE_BY22_DIV
-++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
-++#endif
-++
-++    c->bytestream -= (CABAC_BITS / 8);
-++    c->by22.bits = bits;
-++#if !USE_BY22_DIV
-++    c->by22.range = c->range;
-++    c->range = inv;
-++#endif
-++    c->low = x;
-++}
-++
-++// Bypass block finish
-++// Must be called at the end of the bypass block to return to normal operation
-++static inline void get_cabac_by22_finish(CABACContext * const c)
-++{
-++    unsigned int used = c->by22.bits;
-++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
-++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
-++
-++    c->bytestream += bytes_used + (CABAC_BITS / 8);
-++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
-++#if !USE_BY22_DIV
-++    c->range = c->by22.range;
-++#endif
-++}
-++
-++// Peek bypass bits
-++// _by22_start must be called before _by22_peek is called and _by22_flush
-++// must be called afterwards to flush any used bits
-++// The actual number of valid bits returned is
-++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
-++// will be at least 22 which should be long enough for any prefix or suffix
-++// though probably not long enough for the worst case combination
-++#ifndef get_cabac_by22_peek
-++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
-++{
-++#if USE_BY22_DIV
-++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
-++#else
-++    uint32_t x = c->low & ~1U;
-++    const uint32_t inv = c->range;
-++
-++    if (inv != 0)
-++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
-++
-++    return x << 1;
-++#endif
-++}
-++#endif
-++
-++// Flush bypass bits peeked by _by22_peek
-++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
-++// val is an unmodified copy of whatever _by22_peek returned
-++#ifndef get_cabac_by22_flush
-++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
-++{
-++    // Subtract the bits used & reshift up to the top of the word
-++#if USE_BY22_DIV
-++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
-++#else
-++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
-++#endif
-++
-++    // and refill lower bits
-++    // We will probably OR over some existing bits but that doesn't matter
-++    c->by22.bits += n;
-++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
-++}
-++#endif
-++
-++#endif  // USE_BY22
-++
-++
-+ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
-+ {
-+     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-+@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
-+     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
-+ }
-+ 
-+-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
-++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
-+ }
-+ 
-+-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
-++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
-+ }
-+ 
-+-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
-++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
-+ {
-+-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
-++    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
-+ }
-+ 
-+ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
-+@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
-+     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
-+ }
-+ 
-+-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
-++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
-+                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
-+ {
-+     int i = 0;
-+     int max = (log2_size << 1) - 1;
-+     int ctx_offset, ctx_shift;
-+ 
-+-    if (!c_idx) {
-++    if (!c_idx_nz) {
-+         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
-+         ctx_shift = (log2_size + 1) >> 2;
-+     } else {
-+@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
-+     return value;
-+ }
-+ 
-+-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
-++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
-+ {
-+     int inc;
-+ 
-+-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
-++    inc = (ctx_cg != 0) + (c_idx_nz << 1);
-+ 
-+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
-+ }
-+-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
-+-                                           int offset, const uint8_t *ctx_idx_map)
-+-{
-+-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
-+-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-+-}
-+ 
-+-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
-++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
-+ {
-+     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
-+ }
-+@@ -966,65 +1223,305 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
-+     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
-+ }
-+ 
-+-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
-++
-++#if !USE_BY22
-++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
-++#endif
-++
-++
-++#ifndef coeff_abs_level_remaining_decode_bypass
-++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint32_t y;
-++    unsigned int prefix;
-++    unsigned int last_coeff_abs_level_remaining;
-++    unsigned int n;
-++
-++    y = get_cabac_by22_peek(c);
-++    prefix = hevc_clz32(~y);
-++    // y << prefix will always have top bit 0
-++
-++    if (prefix < 3) {
-++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
-++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
-++        n = prefix + 1 + rice_param;
-++    }
-++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
-++    {
-++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
-++
-++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-++        n = prefix * 2 + rice_param - 2;
-++    }
-++    else {
-++        unsigned int suffix;
-++
-++        get_cabac_by22_flush(c, prefix, y);
-++        y = get_cabac_by22_peek(c);
-++
-++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
-++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-++        n = prefix + rice_param - 2;
-++    }
-++
-++    get_cabac_by22_flush(c, n, y);
-++
-++    return last_coeff_abs_level_remaining;
-++}
-++#endif
-++
-++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
-+ {
-++    CABACContext * const c = &s->HEVClc->cc;
-+     int prefix = 0;
-+     int suffix = 0;
-+     int last_coeff_abs_level_remaining;
-+     int i;
-+ 
-+-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
-++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
-+         prefix++;
-+     if (prefix == CABAC_MAX_BIN) {
-+         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
-+         return 0;
-+     }
-++
-+     if (prefix < 3) {
-+         for (i = 0; i < rc_rice_param; i++)
-+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-++            suffix = (suffix << 1) | get_cabac_bypass(c);
-+         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
-+     } else {
-+         int prefix_minus3 = prefix - 3;
-+         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-+-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
-++            suffix = (suffix << 1) | get_cabac_bypass(c);
-+         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
-+                                               << rc_rice_param) + suffix;
-+     }
-++
-+     return last_coeff_abs_level_remaining;
-+ }
-+ 
-+-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
-++#if !USE_BY22
-++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
-++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
-+ {
-+-    int i;
-+-    int ret = 0;
-++    CABACContext * const c = &s->HEVClc->cc;
-++    unsigned int i;
-++    uint32_t ret = 0;
-+ 
-+     for (i = 0; i < nb; i++)
-+-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
-+-    return ret;
-++        ret = (ret << 1) | get_cabac_bypass(c);
-++
-++    return ret << (32 - nb);
-+ }
-++#endif
-++
-++#ifndef coeff_sign_flag_decode_bypass
-++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint32_t y;
-++    y = get_cabac_by22_peek(c);
-++    get_cabac_by22_flush(c, nb, y);
-++    return y & ~(0xffffffffU >> nb);
-++}
-++#endif
-++
-++
-++#ifndef get_cabac_greater1_bits
-++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
-++    uint8_t * const state0)
-++{
-++    unsigned int i;
-++    unsigned int rv = 0;
-++    for (i = 0; i != n; ++i) {
-++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
-++        const unsigned int b = get_cabac(c, state0 + idx);
-++        rv = (rv << 1) | b;
-++    }
-++    return rv;
-++}
-++#endif
-++
-++
-++// N.B. levels returned are the values assuming coeff_abs_level_remaining
-++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
-++// this version of events.
-++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
-++    int * const pprev_subset_coded, int * const psum,
-++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
-++{
-++    CABACContext * const c = &s->HEVClc->cc;
-++    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
-++    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
-++    unsigned int rv;
-++    unsigned int i;
-++    const unsigned int n = FFMIN(n_end, 8);
-++
-++    // Really this is i != n but the simple unconditional loop is cheaper
-++    // and faster
-++    for (i = 0; i != 8; ++i)
-++        levels[i] = 1;
-++
-++    rv = get_cabac_greater1_bits(c, n, state0);
-++
-++    *pprev_subset_coded = 0;
-++    *psum = n;
-++
-++    rv <<= (32 - n);
-++    if (rv != 0)
-++    {
-++        *pprev_subset_coded = 1;
-++        *psum = n + 1;
-++        i = hevc_clz32(rv);
-++        levels[i] = 2;
-++        if (get_cabac(c, state_gt2) == 0)
-++        {
-++            // Unset first coded bit
-++            rv &= ~(0x80000000U >> i);
-++        }
-++    }
-++
-++    if (n_end > 8) {
-++        const unsigned int g8 = n_end - 8;
-++        rv |= ((1 << g8) - 1) << (24 - g8);
-++        for (i = 0; i != g8; ++i) {
-++            levels[i + 8] = 0;
-++        }
-++    }
-++
-++    return rv;
-++}
-++
-++// extended_precision_processing_flag must be false given we are
-++// putting the result into a 16-bit array
-++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-++// scale_m is uint8_t
-++//
-++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
-++//   or it can be 2 (if we have transquant_bypass)
-++// shift is set to one less than we really want but would normally be
-++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-++// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6
-++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-++// to achieve it
-++
-++#ifndef trans_scale_sat
-++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-++{
-++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-++}
-++#endif
-++
-++
-++#ifndef update_rice
-++static inline void update_rice(uint8_t * const stat_coeff,
-++    const unsigned int last_coeff_abs_level_remaining,
-++    const unsigned int c_rice_param)
-++{
-++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
-++    if (x >= 6)
-++        (*stat_coeff)++;
-++    else if (x == 0 && *stat_coeff > 0)
-++        (*stat_coeff)--;
-++}
-++#endif
-++
-++
-++// n must be > 0 on entry
-++#ifndef get_cabac_sig_coeff_flag_idxs
-++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * p)
-++{
-++    do {
-++        if (get_cabac(c, state0 + ctx_map[n]))
-++            *p++ = n;
-++    } while (--n != 0);
-++    return p;
-++}
-++#endif
-++
-++
-++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-++    unsigned int n,
-++    const uint8_t const * ctx_map,
-++    uint8_t * const flag_idx)
-++{
-++    int rv;
-++
-++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
-++
-++    return rv;
-++}
-++
-++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x1,  x2,  x3,\
-++     x4,  x5,  x6,  x7,\
-++     x8,  x9, x10, x11,\
-++    x12, x13, x14, x15}
-++
-++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x4,  x8, x12,\
-++     x1,  x5,  x9, x13,\
-++     x2,  x6, x10, x14,\
-++     x3,  x7, x11, x15}
-++
-++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-++     x0,  x4,  x1,  x8,\
-++     x5,  x2, x12,  x9,\
-++     x6,  x3, x13, x10,\
-++     x7, x14, x11, x15}
-++
-++
-++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
-++    uint8_t * const significant_coeff_group_flag,
-++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
-++    int * const pPrev_sig)
-++{
-++    while (--i >= 0) {
-++        unsigned int x_cg = scan_x_cg[i];
-++        unsigned int y_cg = scan_y_cg[i];
-++
-++        // For the flag decode we only care about Z/NZ but
-++        // we use the full Right + Down * 2 when calculating
-++        // significant coeff flags so we obtain it here
-++        //.
-++        // The group flag array is one longer than it needs to
-++        // be so we don't need to check for y_cg limits
-++        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
-++            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
-++
-++        if (i == 0 ||
-++            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
-++        {
-++            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
-++            *pPrev_sig = prev_sig;
-++            break;
-++        }
-++    }
-++
-++    return i;
-++}
-++
-+ 
-+ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                                 int log2_trafo_size, enum ScanType scan_idx,
-+                                 int c_idx)
-+ {
-+-#define GET_COORD(offset, n)                                    \
-+-    do {                                                        \
-+-        x_c = (x_cg << 2) + scan_x_off[n];                      \
-+-        y_c = (y_cg << 2) + scan_y_off[n];                      \
-+-    } while (0)
-+-    HEVCLocalContext *lc = s->HEVClc;
-+-    int transform_skip_flag = 0;
-++    HEVCLocalContext * const lc = s->HEVClc;
-++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
-+ 
-+     int last_significant_coeff_x, last_significant_coeff_y;
-+-    int last_scan_pos;
-+-    int n_end;
-+     int num_coeff = 0;
-+-    int greater1_ctx = 1;
-++    int prev_subset_coded = 0;
-+ 
-+     int num_last_subset;
-+     int x_cg_last_sig, y_cg_last_sig;
-+ 
-+-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
-++    const uint8_t *scan_x_cg, *scan_y_cg;
-++    const xy_off_t * scan_xy_off;
-+ 
-+     ptrdiff_t stride = s->frame->linesize[c_idx];
-+     int hshift = s->ps.sps->hshift[c_idx];
-+@@ -1032,21 +1529,28 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
-+                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
-+ #ifdef RPI
-+-    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag && !transform_skip_flag && !lc->tu.cross_pf && log2_trafo_size>=4;
-++    //***** transform_skip_flag decoded later!
-++    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
-+ #endif
-+     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-+-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
-++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
-+     int explicit_rdpcm_flag = 0;
-+     int explicit_rdpcm_dir_flag;
-+ 
-+     int trafo_size = 1 << log2_trafo_size;
-+     int i;
-+-    int qp,shift,add,scale,scale_m;
-+-    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-++    int qp,shift,scale;
-++    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
-+     const uint8_t *scale_matrix = NULL;
-+     uint8_t dc_scale;
-+     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
-+                                          lc->tu.intra_pred_mode_c;
-++
-++    int prev_sig = 0;
-++    const int c_idx_nz = (c_idx != 0);
-++
-++    int may_hide_sign;
-++
-+ #ifdef RPI
-+     if (s->enable_rpi) {
-+         int n = trafo_size * trafo_size;
-+@@ -1078,7 +1582,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+ 
-+     // Derive QP for dequant
-+     if (!lc->cu.cu_transquant_bypass_flag) {
-+-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-++        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
-+         static const uint8_t rem6[51 + 4 * 6 + 1] = {
-+             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
-+             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
-+@@ -1094,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         };
-+         int qp_y = lc->qp_y;
-+ 
-++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
-++
-+         if (s->ps.pps->transform_skip_enabled_flag &&
-+             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-+-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
-++            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
-++            if (transform_skip_flag) {
-++                trans_skip_or_bypass = 1;
-++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
-++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
-++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
-++                    may_hide_sign = 0;
-++                }
-++            }
-+         }
-+ 
-+         if (c_idx == 0) {
-+@@ -1129,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             qp += s->ps.sps->qp_bd_offset;
-+         }
-+ 
-+-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-+-        add      = 1 << (shift-1);
-+-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-+-        scale_m  = 16; // default when no custom scaling lists.
-+-        dc_scale = 16;
-++        // Shift is set to one less than will actually occur as the scale
-++        // and saturate step adds 1 and then shifts right again
-++        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
-++        scale = level_scale[rem6[qp]];
-++        if (div6[qp] >= shift) {
-++            scale <<= (div6[qp] - shift);
-++            shift = 0;
-++        } else {
-++            shift -= div6[qp];
-++        }
-+ 
-+-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
-+             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-+-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-+             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
-+ 
-+             matrix_id = 3 * matrix_id + c_idx;
-+ 
-+             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-++            dc_scale = scale_matrix[0];
-+             if (log2_trafo_size >= 4)
-+                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-+         }
-++        else
-++        {
-++            static const uint8_t sixteen_scale[64] = {
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16,
-++                16, 16, 16, 16, 16, 16, 16, 16
-++            };
-++            scale_matrix = sixteen_scale;
-++            dc_scale = 16;
-++        }
-+     } else {
-++        static const uint8_t unit_scale[64] = {
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++            1, 1, 1, 1, 1, 1, 1, 1,
-++        };
-++        scale_matrix = unit_scale;
-+         shift        = 0;
-+-        add          = 0;
-+-        scale        = 0;
-+-        dc_scale     = 0;
-++        scale        = 2;  // We will shift right to kill this
-++        dc_scale     = 1;
-++
-++        may_hide_sign = 0;
-+     }
-+ 
-+     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-+-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
-++        trans_skip_or_bypass) {
-++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
-+         if (explicit_rdpcm_flag) {
-+-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
-++            may_hide_sign = 0;
-++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
-+         }
-+     }
-+ 
-+-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
-++    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
-+                                            &last_significant_coeff_x, &last_significant_coeff_y);
-+ 
-+     if (last_significant_coeff_x > 3) {
-+@@ -1189,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+         int last_x_c = last_significant_coeff_x & 3;
-+         int last_y_c = last_significant_coeff_y & 3;
-+ 
-+-        scan_x_off = ff_hevc_diag_scan4x4_x;
-+-        scan_y_off = ff_hevc_diag_scan4x4_y;
-+         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-+-        if (trafo_size == 4) {
-++
-++        switch (log2_trafo_size) {
-++        case 2:
-+             scan_x_cg = scan_1x1;
-+             scan_y_cg = scan_1x1;
-+-        } else if (trafo_size == 8) {
-++            break;
-++        case 3:
-+             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = diag_scan2x2_x;
-+             scan_y_cg = diag_scan2x2_y;
-+-        } else if (trafo_size == 16) {
-++            break;
-++        case 4:
-+             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = ff_hevc_diag_scan4x4_x;
-+             scan_y_cg = ff_hevc_diag_scan4x4_y;
-+-        } else { // trafo_size == 32
-++            break;
-++        case 5:
-++        default:
-+             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+             scan_x_cg = ff_hevc_diag_scan8x8_x;
-+             scan_y_cg = ff_hevc_diag_scan8x8_y;
-++            break;
-+         }
-+         break;
-+     }
-+     case SCAN_HORIZ:
-+         scan_x_cg = horiz_scan2x2_x;
-+         scan_y_cg = horiz_scan2x2_y;
-+-        scan_x_off = horiz_scan4x4_x;
-+-        scan_y_off = horiz_scan4x4_y;
-+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-+         break;
-+     default: //SCAN_VERT
-+         scan_x_cg = horiz_scan2x2_y;
-+         scan_y_cg = horiz_scan2x2_x;
-+-        scan_x_off = horiz_scan4x4_y;
-+-        scan_y_off = horiz_scan4x4_x;
-+         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-+         break;
-+     }
-+     num_coeff++;
-+     num_last_subset = (num_coeff - 1) >> 4;
-+ 
-+-    for (i = num_last_subset; i >= 0; i--) {
-+-        int n, m;
-+-        int x_cg, y_cg, x_c, y_c, pos;
-+-        int implicit_non_zero_coeff = 0;
-+-        int64_t trans_coeff_level;
-+-        int prev_sig = 0;
-+-        int offset = i << 4;
-+-        int rice_init = 0;
-+-
-+-        uint8_t significant_coeff_flag_idx[16];
-+-        uint8_t nb_significant_coeff_flag = 0;
-++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
-+ 
-+-        x_cg = scan_x_cg[i];
-+-        y_cg = scan_y_cg[i];
-++    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
-+ 
-+-        if ((i < num_last_subset) && (i > 0)) {
-+-            int ctx_cg = 0;
-+-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-+-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-+-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-+-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-+-
-+-            significant_coeff_group_flag[x_cg][y_cg] =
-+-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-+-            implicit_non_zero_coeff = 1;
-+-        } else {
-+-            significant_coeff_group_flag[x_cg][y_cg] =
-+-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-+-             (x_cg == 0 && y_cg == 0));
-+-        }
-++    i = num_last_subset;
-++    do {
-++        int implicit_non_zero_coeff = 0;
-++        int n_end;
-+ 
-+-        last_scan_pos = num_coeff - offset - 1;
-++        uint8_t significant_coeff_flag_idx[16];
-++        unsigned int nb_significant_coeff_flag = 0;
-+ 
-+         if (i == num_last_subset) {
-++            // First time through
-++            int last_scan_pos = num_coeff - (i << 4) - 1;
-+             n_end = last_scan_pos - 1;
-+             significant_coeff_flag_idx[0] = last_scan_pos;
-+             nb_significant_coeff_flag = 1;
-+         } else {
-+             n_end = 15;
-++            implicit_non_zero_coeff = (i != 0);
-+         }
-+ 
-+-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-+-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
-+-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-+-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-+-
-+-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
-+-            static const uint8_t ctx_idx_map[] = {
-+-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
-+-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
-+-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
-+-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
-+-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
-++        if (n_end >= 0) {
-++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
-++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
-++            };
-++            static const uint8_t ctx_idx_maps[3][4][16] = {
-++                {
-++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                },
-++                {
-++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                },
-++                {
-++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
-++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
-++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
-++                }
-+             };
-+             const uint8_t *ctx_idx_map_p;
-+             int scf_offset = 0;
-+-            if (s->ps.sps->transform_skip_context_enabled_flag &&
-+-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
-+-                if (c_idx == 0) {
-+-                    scf_offset = 40;
-+-                } else {
-+-                    scf_offset = 14 + 27;
-+-                }
-++
-++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-++                ctx_idx_map_p = ctx_idx_maps[0][3];
-++                scf_offset = 40 + c_idx_nz;
-+             } else {
-+-                if (c_idx != 0)
-++                if (c_idx_nz != 0)
-+                     scf_offset = 27;
-++
-+                 if (log2_trafo_size == 2) {
-+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
-++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
-+                 } else {
-+-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
-+-                    if (c_idx == 0) {
-+-                        if ((x_cg > 0 || y_cg > 0))
-++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
-++                    if (!c_idx_nz) {
-++                        if (i != 0)
-+                             scf_offset += 3;
-++
-+                         if (log2_trafo_size == 3) {
-+                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
-+                         } else {
-+@@ -1315,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+                     }
-+                 }
-+             }
-+-            for (n = n_end; n > 0; n--) {
-+-                x_c = scan_x_off[n];
-+-                y_c = scan_y_off[n];
-+-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
-+-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-+-                    nb_significant_coeff_flag++;
-++
-++            if (n_end > 0) {
-++                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
-++                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
-++                    n_end, ctx_idx_map_p,
-++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
-++
-++                nb_significant_coeff_flag += cnt;
-++                if (cnt != 0) {
-+                     implicit_non_zero_coeff = 0;
-+                 }
-+             }
-++
-+             if (implicit_non_zero_coeff == 0) {
-+-                if (s->ps.sps->transform_skip_context_enabled_flag &&
-+-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-+-                    if (c_idx == 0) {
-+-                        scf_offset = 42;
-+-                    } else {
-+-                        scf_offset = 16 + 27;
-+-                    }
-++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-++                    scf_offset = 42 + c_idx_nz;
-+                 } else {
-+                     if (i == 0) {
-+-                        if (c_idx == 0)
-+-                            scf_offset = 0;
-+-                        else
-+-                            scf_offset = 27;
-++                        scf_offset = c_idx_nz ? 27 : 0;
-+                     } else {
-+                         scf_offset = 2 + scf_offset;
-+                     }
-+                 }
-+-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
-++                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
-+                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+                     nb_significant_coeff_flag++;
-+                 }
-+@@ -1352,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             }
-+         }
-+ 
-+-        n_end = nb_significant_coeff_flag;
-+-
-++        if (nb_significant_coeff_flag != 0) {
-++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
-++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
-++                prev_subset_coded;
-++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
-++                (gt1_idx_delta << 2);
-++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
-++                gt1_idx_delta;
-++
-++            const unsigned int x_cg = scan_x_cg[i];
-++            const unsigned int y_cg = scan_y_cg[i];
-++            int16_t * const blk_coeffs = coeffs +
-++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
-++            // This calculation is 'wrong' for log2_traffo_size == 2
-++            // but that doesn't mattor as in this case x_cg & y_cg
-++            // are always 0 so result is correct (0) anyway
-++            const uint8_t * const blk_scale = scale_matrix +
-++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
-++
-++            // * THe following code block doesn't deal with these flags:
-++            //   (nor did the one it replaces)
-++            //
-++            // cabac_bypass_alignment_enabled_flag
-++            //    This should be easy but I can't find a test case
-++            // extended_precision_processing_flag
-++            //    This can extend the required precision past 16bits
-++            //    so is probably tricky - also no example found yet
-++
-++#if USE_N_END_1
-++            if (nb_significant_coeff_flag == 1) {
-++                // There is a small gain to be had from special casing the single
-++                // transform coefficient case.  The reduction in complexity
-++                // makes up for the code duplicatioon.
-++
-++                int trans_coeff_level = 1;
-++                int coeff_sign_flag;
-++                int coded_val = 0;
-++
-++                // initialize first elem of coeff_bas_level_greater1_flag
-++                prev_subset_coded = 0;
-++
-++                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
-++                    trans_coeff_level = 2;
-++                    prev_subset_coded = 1;
-++                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
-++                }
-+ 
-+-        if (n_end) {
-+-            int first_nz_pos_in_cg;
-+-            int last_nz_pos_in_cg;
-+-            int c_rice_param = 0;
-+-            int first_greater1_coeff_idx = -1;
-+-            uint8_t coeff_abs_level_greater1_flag[8];
-+-            uint16_t coeff_sign_flag;
-+-            int sum_abs = 0;
-+-            int sign_hidden;
-+-            int sb_type;
-++                // Probably not worth the overhead of starting by22 for just one value
-++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
-+ 
-++                if (coded_val)
-++                {
-++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
-++                    } else {
-++                        uint8_t * const stat_coeff =
-++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-++                        const unsigned int c_rice_param = *stat_coeff >> 2;
-++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+ 
-+-            // initialize first elem of coeff_bas_level_greater1_flag
-+-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
-++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
-++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-++                    }
-++                }
-+ 
-+-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-+-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
-+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
-+-                else
-+-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
-+-                c_rice_param = lc->stat_coeff[sb_type] / 4;
-+-            }
-++                {
-++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
-++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
-++                    const unsigned int scale_m = blk_scale[xy_off->scale];
-+ 
-+-            if (!(i == num_last_subset) && greater1_ctx == 0)
-+-                ctx_set++;
-+-            greater1_ctx = 1;
-+-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-+-
-+-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-+-                int inc = (ctx_set << 2) + greater1_ctx;
-+-                coeff_abs_level_greater1_flag[m] =
-+-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-+-                if (coeff_abs_level_greater1_flag[m]) {
-+-                    greater1_ctx = 0;
-+-                    if (first_greater1_coeff_idx == -1)
-+-                        first_greater1_coeff_idx = m;
-+-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-+-                    greater1_ctx++;
-++                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
-++                        (trans_coeff_level ^ k) - k,  // Apply sign
-++                        scale,
-++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
-++                        shift);
-+                 }
-+             }
-+-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-+-
-+-            if (lc->cu.cu_transquant_bypass_flag ||
-+-                (lc->cu.pred_mode ==  MODE_INTRA  &&
-+-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
-+-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
-+-                 explicit_rdpcm_flag)
-+-                sign_hidden = 0;
-+             else
-+-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
-++#endif
-++            {
-++                int sign_hidden = may_hide_sign;
-++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
-++                uint32_t coeff_sign_flags;
-++                uint32_t coded_vals = 0;
-++                // Sum(abs(level[]))
-++                // In fact we only need the bottom bit and in some future
-++                // version that may be all we calculate
-++                unsigned int sum_abs;
-++
-++                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
-++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
-++
-++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
-++                    sign_hidden = 0;
-++
-++                // -- Start bypass block
-++
-++                bypass_start(s);
-++
-++                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
-++
-++                if (coded_vals != 0)
-++                {
-++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
-++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
-++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
-++                    int * level = levels - 1;
-++
-++                    do {
-++                        {
-++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
-++                            level += z;
-++                            coded_vals <<= z;
-++                        }
-+ 
-+-            if (first_greater1_coeff_idx != -1) {
-+-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-+-            }
-+-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
-+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-+-            } else {
-+-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-+-            }
-++                        {
-++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
-++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
-++
-++                            sum_abs += last_coeff_abs_level_remaining + 1;
-++                            *level = trans_coeff_level;
-+ 
-+-            for (m = 0; m < n_end; m++) {
-+-                n = significant_coeff_flag_idx[m];
-+-                GET_COORD(offset, n);
-+-                if (m < 8) {
-+-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
-+-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
-+-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+-
-+-                        trans_coeff_level += last_coeff_abs_level_remaining;
-+-                        if (trans_coeff_level > (3 << c_rice_param))
-+-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-+-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-+-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-+-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-+-                                lc->stat_coeff[sb_type]++;
-+-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-+-                                if (lc->stat_coeff[sb_type] > 0)
-+-                                    lc->stat_coeff[sb_type]--;
-+-                            rice_init = 1;
-++                            if (stat_coeff != NULL)
-++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-++                            stat_coeff = NULL;
-++
-++                            if (trans_coeff_level > (3 << c_rice_param) &&
-++                                (c_rice_param < 4 || rice_adaptation_enabled))
-++                                ++c_rice_param;
-+                         }
-+-                    }
-+-                } else {
-+-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-+-
-+-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
-+-                    if (trans_coeff_level > (3 << c_rice_param))
-+-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-+-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-+-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-+-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-+-                            lc->stat_coeff[sb_type]++;
-+-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-+-                            if (lc->stat_coeff[sb_type] > 0)
-+-                                lc->stat_coeff[sb_type]--;
-+-                        rice_init = 1;
-+-                    }
-++                    } while (coded_vals != 0);
-+                 }
-+-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-+-                    sum_abs += trans_coeff_level;
-+-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
-+-                        trans_coeff_level = -trans_coeff_level;
-++
-++                // sign_hidden = 0 or 1 so we can combine the tests
-++                if ((sign_hidden & sum_abs) != 0) {
-++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
-+                 }
-+-                if (coeff_sign_flag >> 15)
-+-                    trans_coeff_level = -trans_coeff_level;
-+-                coeff_sign_flag <<= 1;
-+-                if(!lc->cu.cu_transquant_bypass_flag) {
-+-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-+-                        if(y_c || x_c || log2_trafo_size < 4) {
-+-                            switch(log2_trafo_size) {
-+-                                case 3: pos = (y_c << 3) + x_c; break;
-+-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-+-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-+-                                default: pos = (y_c << 2) + x_c; break;
-+-                            }
-+-                            scale_m = scale_matrix[pos];
-+-                        } else {
-+-                            scale_m = dc_scale;
-+-                        }
-++
-++                bypass_finish(s);
-++
-++                // -- Finish bypass block
-++
-++                // Scale loop
-++                {
-++                    int m = nb_significant_coeff_flag - 1;
-++
-++                    // Deal with DC component (if any) first
-++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
-++                    {
-++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-++                        blk_coeffs[0] = trans_scale_sat(
-++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
-++                        --m;
-+                     }
-+-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-+-                    if(trans_coeff_level < 0) {
-+-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-+-                            trans_coeff_level = -32768;
-+-                    } else {
-+-                        if(trans_coeff_level & 0xffffffffffff8000)
-+-                            trans_coeff_level = 32767;
-++
-++#if !USE_N_END_1
-++                    // If N_END_! set then m was at least 1 initially
-++                    if (m >= 0)
-++#endif
-++                    {
-++                        do {
-++                            const xy_off_t * const xy_off = scan_xy_off +
-++                                significant_coeff_flag_idx[m];
-++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-++
-++                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
-++                                (levels[m] ^ k) - k,
-++                                scale,
-++                                blk_scale[xy_off->scale],
-++                                shift);
-++                        } while (--m >= 0);
-+                     }
-+                 }
-+-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
-++
-+             }
-+         }
-+-    }
-++    } while ((i = next_subset(s, i, c_idx_nz,
-++        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
-+ 
-+     if (lc->cu.cu_transquant_bypass_flag) {
-+         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+@@ -1496,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
-+             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+         }
-+     } else {
-+-        if (transform_skip_flag) {
-++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
-+             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
-+                       log2_trafo_size == 2 &&
-+                       lc->cu.pred_mode == MODE_INTRA;
-+-- 
-+2.5.0
-+
-diff --git a/tools/depends/target/ffmpeg/Makefile b/tools/depends/target/ffmpeg/Makefile
-index e4acfa9..072e711 100644
---- a/tools/depends/target/ffmpeg/Makefile
-+++ b/tools/depends/target/ffmpeg/Makefile
-@@ -4,7 +4,8 @@ DEPS= ../../Makefile.include FFMPEG-VERSION Makefile ffmpeg_Speed_up_wtv_index_c
-   0001-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch \
-   0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch \
-   hevcdsp_ARM_NEON_optimized_epel_functions.patch added_ARM_NEON_optimized_SAO_patches.patch \
--  pfcd_hevc_optimisations.patch
-+  pfcd_hevc_optimisations.patch \
-+  0001-Squashed-commit-of-the-following.patch
- 
- # set to "yes" to enable patching
- # we don't apply patches until we move to a vanilla ffmpeg tarball
-@@ -66,6 +67,7 @@ ifeq ($(Configuration), Release)
-   ffmpg_config += --disable-debug
- endif
- 
-+ffmpg_config += --extra-cflags="-DRPI=1"
- 
- CLEAN_FILES=$(ARCHIVE) $(PLATFORM)
- 
-@@ -84,6 +86,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- 	cd $(PLATFORM); patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
- 	cd $(PLATFORM); patch -p1 < ../pfcd_hevc_optimisations.patch
-+	cd $(PLATFORM); patch -p1 < ../0001-Squashed-commit-of-the-following.patch
- 
- 	cd $(PLATFORM);\
- 	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
-
-From 653e6185b0976bd50eea79f9834ede99db13e3aa Mon Sep 17 00:00:00 2001
-From: Rainer Hochecker <fernetmenta@online.de>
-Date: Tue, 14 Jul 2015 08:30:44 +0200
-Subject: [PATCH 65/93] fix high cpu load caused by false positive frame
- limiter
-
----
- xbmc/Application.cpp | 20 +++++++++-----------
- 1 file changed, 9 insertions(+), 11 deletions(-)
-
-diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index 212a5c7..1adbb01 100644
---- a/xbmc/Application.cpp
-+++ b/xbmc/Application.cpp
-@@ -1902,7 +1902,7 @@ void CApplication::Render()
- 
-   bool hasRendered = false;
-   bool limitFrames = false;
--  unsigned int singleFrameTime = 10; // default limit 100 fps
-+  unsigned int singleFrameTime = 40; // default limit 25 fps
-   bool vsync = true;
- 
-   // Whether externalplayer is playing and we're unfocused
-@@ -1916,24 +1916,22 @@ void CApplication::Render()
-     if (!extPlayerActive && g_graphicsContext.IsFullScreenVideo() && !m_pPlayer->IsPausedPlayback())
-     {
-       m_bPresentFrame = g_renderManager.HasFrame();
--      if (vsync_mode == VSYNC_DISABLED)
--        vsync = false;
-     }
-     else
-     {
-       // engage the frame limiter as needed
-       limitFrames = lowfps || extPlayerActive;
--      // DXMERGE - we checked for g_videoConfig.GetVSyncMode() before this
--      //           perhaps allowing it to be set differently than the UI option??
-+
-+      // TODO:
-+      // remove those useless modes, they don't do any good
-       if (vsync_mode == VSYNC_DISABLED || vsync_mode == VSYNC_VIDEO)
-       {
-         limitFrames = true; // not using vsync.
--        vsync = false;
-+        singleFrameTime = 10;
-       }
--      else if ((g_infoManager.GetFPS() > g_graphicsContext.GetFPS() + 10) && g_infoManager.GetFPS() > 1000.0f / singleFrameTime)
-+      else if ((g_infoManager.GetFPS() > g_graphicsContext.GetFPS() + 10) && g_infoManager.GetFPS() > 100.0f)
-       {
-         limitFrames = true; // using vsync, but it isn't working.
--        vsync = false;
-       }
- 
-       if (limitFrames)
-@@ -1957,7 +1955,10 @@ void CApplication::Render()
-   else if (vsync_mode == VSYNC_ALWAYS)
-     g_Windowing.SetVSync(true);
-   else if (vsync_mode != VSYNC_DRIVER)
-+  {
-     g_Windowing.SetVSync(false);
-+    vsync = false;
-+  }
- 
-   if (m_bPresentFrame && m_pPlayer->IsPlaying() && !m_pPlayer->IsPaused())
-     ResetScreenSaver();
-@@ -2025,9 +2026,6 @@ void CApplication::Render()
-   //fps limiter, make sure each frame lasts at least singleFrameTime milliseconds
-   if (limitFrames || !(flip || m_bPresentFrame))
-   {
--    if (!limitFrames)
--      singleFrameTime = 40; //if not flipping, loop at 25 fps
--
-     unsigned int frameTime = now - m_lastFrameTime;
-     if (frameTime < singleFrameTime)
-       Sleep(singleFrameTime - frameTime);
-
-From 7eae470ce134f19cb5002969ac3f7e85fcf5220d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 5 Aug 2015 13:43:25 +0100
-Subject: [PATCH 66/93] [dvdplayeraudio] Avoid busy spinning when queue is
- empty
-
----
- xbmc/cores/dvdplayer/DVDPlayerAudio.cpp | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-index 97a23a6..9f21a19 100644
---- a/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-+++ b/xbmc/cores/dvdplayer/DVDPlayerAudio.cpp
-@@ -541,6 +541,8 @@ void CDVDPlayerAudio::Process()
-         m_dvdAudio.Drain();
-         m_dvdAudio.Flush();
-         m_stalled = true;
-+        // avoid busy spinning here
-+        Sleep(10);
-       }
- 
-       continue;
-
-From 492a2e7ac5fb1895b71b62f68918e74db053f0b9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 6 Aug 2015 11:23:05 +0100
-Subject: [PATCH 67/93] [rbp] Make sync playback to display the default option
-
----
- system/settings/rbp.xml | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index 1506035..f2a6892 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -1,6 +1,13 @@
- <?xml version="1.0" encoding="utf-8" ?>
- <settings>
-   <section id="videos">
-+    <category id="videoplayer">
-+      <group id="3">
-+        <setting id="videoplayer.usedisplayasclock">
-+          <default>true</default>
-+        </setting>
-+      </group>
-+    </category>
-     <category id="videoacceleration">
-       <group id="1">
-         <visible>false</visible>
-
-From 3ff59db3bd9c43b037bbe89c72f5fd97f4563b71 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 8 Sep 2015 23:42:30 +0100
-Subject: [PATCH 68/93] [cec] Fixing initialisation issue found on Raspberry Pi
- with Buildroot
-
----
- tools/depends/target/libcec/Makefile               |  1 +
- ...ssue-found-on-Raspberry-Pi-with-Buildroot.patch | 24 ++++++++++++++++++++++
- 2 files changed, 25 insertions(+)
- create mode 100644 tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-
-diff --git a/tools/depends/target/libcec/Makefile b/tools/depends/target/libcec/Makefile
-index 5d1f933..4663faa 100644
---- a/tools/depends/target/libcec/Makefile
-+++ b/tools/depends/target/libcec/Makefile
-@@ -23,6 +23,7 @@ $(PLATFORM): $(TARBALLS_LOCATION)/$(ARCHIVE) $(DEPS)
- 	cd $(PLATFORM); $(ARCHIVE_TOOL) $(ARCHIVE_TOOL_FLAGS) $(TARBALLS_LOCATION)/$(ARCHIVE)
- 	cd $(PLATFORM); patch -p1 < ../popcornmix.patch
- 	cd $(PLATFORM); patch -p1 < ../0001-re-implement-RFC-style-POLLing-for-LA-registering-pr.patch
-+	cd $(PLATFORM); patch -p1 < ../fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
- 	cd $(PLATFORM)/build; $(CMAKE) -DBUILD_SHARED_LIBS=1 -DSKIP_PYTHON_WRAPPER:STRING=1 -DCMAKE_INSTALL_LIBDIR=$(PREFIX)/lib ..
- 
- $(LIBDYLIB): $(PLATFORM)
-diff --git a/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch b/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-new file mode 100644
-index 0000000..8f289f2
---- /dev/null
-+++ b/tools/depends/target/libcec/fixing-initialisation-issue-found-on-Raspberry-Pi-with-Buildroot.patch
-@@ -0,0 +1,24 @@
-+From 9a252570dc3ca1f5b92a48542e29b2722550e670 Mon Sep 17 00:00:00 2001
-+From: Erwan LOUET <erwan.louet@orange.com>
-+Date: Fri, 4 Sep 2015 15:34:19 +0200
-+Subject: [PATCH] fixing initialisation issue found on Raspberry Pi with
-+ Buildroot
-+
-+---
-+ src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp | 3 ++-
-+ 1 file changed, 2 insertions(+), 1 deletion(-)
-+
-+diff --git a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+index 6f0804d..95b4fef 100644
-+--- a/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-++++ b/src/libcec/adapter/RPi/RPiCECAdapterCommunication.cpp
-+@@ -71,7 +71,8 @@ CRPiCECAdapterCommunication::CRPiCECAdapterCommunication(IAdapterCommunicationCa
-+     m_bLogicalAddressChanged(false),
-+     m_previousLogicalAddress(CECDEVICE_FREEUSE),
-+     m_bLogicalAddressRegistered(false),
-+-    m_bDisableCallbacks(false)
-++    m_bDisableCallbacks(false),
-++    m_bInitialised(false)
-+ {
-+   m_queue = new CRPiCECAdapterMessageQueue(this);
-+ }
-
-From c456ad03e68428ef849490c385cc069cb8dde87d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 69/93] [mmalcodec] Fail to open when width is invalid. Can
- happen with mpegts files
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 3345685..5386b4a 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -532,6 +532,10 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s usemmal:%d software:%d %dx%d pool:%p", CLASSNAME, __func__, CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL), hints.software, hints.width, hints.height, options.m_opaque_pointer);
- 
-+  // This occurs at start of m2ts files before streams have been fully identified - just ignore
-+  if (!hints.width)
-+    return false;
-+
-   // we always qualify even if DVDFactoryCodec does this too.
-   if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
-     return false;
-
-From 9bcbb1f3c5e687ccf4aeecbe583eb7643f5d48c8 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 21 Dec 2015 18:34:06 +0000
-Subject: [PATCH 70/93] [mmalrender] Force a SetVideoRect after a reconfigure
-
-There has been an issue with dvd stills and a hdmi refresh rate change.
-The hdmi mode change loses the currently displayed picture.
-Not an issue for normal video playback as another picture will be along soon.
-Not the case in DVD menus.
-
-SetVideoRect makes the last picture redisplay and so fixes up the menu.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index adf6f73..ad3f66f 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -286,6 +286,10 @@ bool CMMALRenderer::Configure(unsigned int width, unsigned int height, unsigned
-   m_fps = fps;
-   m_iFlags = flags;
- 
-+  // cause SetVideoRect to trigger - needed after a hdmi mode change
-+  m_src_rect.SetRect(0, 0, 0, 0);
-+  m_dst_rect.SetRect(0, 0, 0, 0);
-+
-   CLog::Log(LOGDEBUG, "%s::%s - %dx%d->%dx%d@%.2f flags:%x format:%d ext:%x orient:%d", CLASSNAME, __func__, width, height, d_width, d_height, fps, flags, format, extended_format, orientation);
-   if (format != RENDER_FMT_YUV420P && format != RENDER_FMT_BYPASS && format != RENDER_FMT_MMAL)
-   {
-
-From 902a0514368d1ec48107d5951ee990b93cb4282f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 31 Mar 2015 17:31:47 +0100
-Subject: [PATCH 71/93] mmalcodec: Add SetCodecControl function
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 10 +++++++++-
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  2 ++
- 2 files changed, 11 insertions(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 5386b4a..799b708 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -130,6 +130,7 @@ CMMALVideo::CMMALVideo()
-   m_speed = DVD_PLAYSPEED_NORMAL;
-   m_fps = 0.0f;
-   m_num_decoded = 0;
-+  m_codecControlFlags = 0;
- }
- 
- CMMALVideo::~CMMALVideo()
-@@ -875,7 +876,7 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-   }
- 
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
--    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full);
-+    CLog::Log(LOGDEBUG, "%s::%s - ret(%x) pics(%d) inputs(%d) slept(%d) queued(%.2f) (%.2f:%.2f) full(%d) flags(%x)", CLASSNAME, __func__, ret, m_output_ready.size(), mmal_queue_length(m_dec_input_pool->queue), slept, queued*1e-6, m_demuxerPts*1e-6, m_decoderPts*1e-6, full, m_codecControlFlags);
- 
-   return ret;
- }
-@@ -939,6 +940,7 @@ void CMMALVideo::Reset(void)
-   }
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
-+  m_codecControlFlags = 0;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -1049,3 +1051,9 @@ bool CMMALVideo::GetCodecStats(double &pts, int &droppedPics)
-   droppedPics= -1;
-   return false;
- }
-+
-+void CMMALVideo::SetCodecControl(int flags)
-+{
-+  CSingleLock lock(m_sharedSection);
-+  m_codecControlFlags = flags;
-+}
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index f4df09c..37d0868 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -79,6 +79,7 @@ public:
-   virtual void SetDropState(bool bDrop);
-   virtual const char* GetName(void) { return m_pFormatName ? m_pFormatName:"mmal-xxx"; }
-   virtual bool GetCodecStats(double &pts, int &droppedPics);
-+  virtual void SetCodecControl(int flags);
-   virtual void SetSpeed(int iSpeed);
- 
-   // MMAL decoder callback routines.
-@@ -121,6 +122,7 @@ protected:
-   double            m_demuxerPts;
-   double            m_decoderPts;
-   int               m_speed;
-+  int               m_codecControlFlags;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From cae99d2093015ba70d1a387e83ed6214393fc31a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 27 Dec 2015 18:44:22 +0000
-Subject: [PATCH 72/93] mmalcodec: Switch to a condition variable when blocking
- waiting for a picture
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 38 ++++++++++++----------
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   |  3 +-
- 2 files changed, 23 insertions(+), 18 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 799b708..3579966 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -102,7 +102,6 @@ CMMALVideo::CMMALVideo()
- {
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s %p", CLASSNAME, __func__, this);
--  pthread_mutex_init(&m_output_mutex, NULL);
- 
-   m_decoded_width = 0;
-   m_decoded_height = 0;
-@@ -141,7 +140,6 @@ CMMALVideo::~CMMALVideo()
-     Dispose();
- 
-   CSingleLock lock(m_sharedSection);
--  pthread_mutex_destroy(&m_output_mutex);
- 
-   if (m_deint && m_deint->control && m_deint->control->is_enabled)
-     mmal_port_disable(m_deint->control);
-@@ -285,9 +283,11 @@ void CMMALVideo::dec_output_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buf
-         omvb->width = m_decoded_width;
-         omvb->height = m_decoded_height;
-         omvb->m_aspect_ratio = m_aspect_ratio;
--        pthread_mutex_lock(&m_output_mutex);
--        m_output_ready.push(omvb);
--        pthread_mutex_unlock(&m_output_mutex);
-+        {
-+          CSingleLock lock(m_output_mutex);
-+          m_output_ready.push(omvb);
-+          m_output_cond.notifyAll();
-+        }
-         kept = true;
-       }
-     }
-@@ -867,7 +867,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-     {
-       // otherwise we busy spin
-       CSingleExit unlock(m_sharedSection);
--      Sleep(10);
-+      CSingleLock lock(m_output_mutex);
-+      m_output_cond.wait(lock, 10);
-     }
-     if (!m_output_ready.empty())
-       ret |= VC_PICTURE;
-@@ -916,14 +917,16 @@ void CMMALVideo::Reset(void)
-   while (1)
-   {
-     CMMALVideoBuffer *buffer = NULL;
--    pthread_mutex_lock(&m_output_mutex);
--    // fetch a output buffer and pop it off the ready list
--    if (!m_output_ready.empty())
-     {
--      buffer = m_output_ready.front();
--      m_output_ready.pop();
-+      CSingleLock lock(m_output_mutex);
-+      // fetch a output buffer and pop it off the ready list
-+      if (!m_output_ready.empty())
-+      {
-+        buffer = m_output_ready.front();
-+        m_output_ready.pop();
-+      }
-+      m_output_cond.notifyAll();
-     }
--    pthread_mutex_unlock(&m_output_mutex);
-     if (buffer)
-     {
-       buffer->Acquire();
-@@ -984,11 +987,12 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
-   {
-     CMMALVideoBuffer *buffer;
-     // fetch a output buffer and pop it off the ready list
--    pthread_mutex_lock(&m_output_mutex);
--    buffer = m_output_ready.front();
--    m_output_ready.pop();
--    pthread_mutex_unlock(&m_output_mutex);
--
-+    {
-+      CSingleLock lock(m_output_mutex);
-+      buffer = m_output_ready.front();
-+      m_output_ready.pop();
-+      m_output_cond.notifyAll();
-+    }
-     assert(buffer->mmal_buffer);
-     memset(pDvdVideoPicture, 0, sizeof *pDvdVideoPicture);
-     pDvdVideoPicture->format = RENDER_FMT_MMAL;
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index 37d0868..ca28c6f 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -105,7 +105,8 @@ protected:
-   const char        *m_pFormatName;
- 
-   // mmal output buffers (video frames)
--  pthread_mutex_t   m_output_mutex;
-+  CCriticalSection m_output_mutex;
-+  XbmcThreads::ConditionVariable m_output_cond;
-   std::queue<CMMALVideoBuffer*> m_output_ready;
- 
-   // initialize mmal and get decoder component
-
-From ec6e9acc113651fc3408c9fc32d188f41d8de64a Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 16 Jan 2016 16:46:03 +0000
-Subject: [PATCH 73/93] omxaudio: Avoid reporting a spurious cached value
-
-Avoids seek bar showing zero after a seek
----
- xbmc/cores/omxplayer/OMXAudio.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudio.cpp b/xbmc/cores/omxplayer/OMXAudio.cpp
-index 70d0866..052b5ef 100644
---- a/xbmc/cores/omxplayer/OMXAudio.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudio.cpp
-@@ -1335,7 +1335,7 @@ float COMXAudio::GetDelay()
-   if (m_last_pts != DVD_NOPTS_VALUE && m_av_clock)
-     stamp = m_av_clock->OMXMediaTime();
-   // if possible the delay is current media time - time of last submitted packet
--  if (stamp != DVD_NOPTS_VALUE)
-+  if (stamp != DVD_NOPTS_VALUE && stamp != 0.0)
-   {
-     ret = (m_last_pts - stamp) * (1.0 / DVD_TIME_BASE);
-   }
-
-From 711b4b11b49c9ebc255e565462e3ac665a1cda8c Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 74/93] [omximage] Fall back to arm jpeg encode/decode when gpu
- is busy
-
----
- xbmc/cores/omxplayer/OMXImage.cpp | 50 ++++++++++++++++++++++++++++++++-------
- xbmc/cores/omxplayer/OMXImage.h   |  7 ++++++
- 2 files changed, 48 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXImage.cpp b/xbmc/cores/omxplayer/OMXImage.cpp
-index a01c435..e592989 100644
---- a/xbmc/cores/omxplayer/OMXImage.cpp
-+++ b/xbmc/cores/omxplayer/OMXImage.cpp
-@@ -56,12 +56,17 @@ static XbmcThreads::ConditionVariable g_count_cond;
- static CCriticalSection               g_count_lock;
- static int g_count_val;
- 
--static void limit_calls_enter()
-+static bool limit_calls_enter()
- {
-   CSingleLock lock(g_count_lock);
-+  // on Pi2 fall back to arm decode if the queue is getting big
-+  if (g_RBP.RasberryPiVersion() > 1 && g_count_val >= 2)
-+    return false;
-+
-   while (g_count_val >= 3)
-     g_count_cond.wait(lock);
-   g_count_val++;
-+  return true;
- }
- 
- static void limit_calls_leave()
-@@ -112,6 +117,9 @@ bool COMXImage::CreateThumbnailFromSurface(unsigned char* buffer, unsigned int w
-       unsigned int format, unsigned int pitch, const std::string& destFile)
- {
-   COMXImageEnc omxImageEnc;
-+  if (!omxImageEnc.Gpu())
-+    return false;
-+
-   bool ret = omxImageEnc.CreateThumbnailFromSurface(buffer, width, height, format, pitch, destFile);
-   if (!ret)
-     CLog::Log(LOGNOTICE, "%s: unable to create thumbnail %s %dx%d", __func__, destFile.c_str(), width, height);
-@@ -205,6 +213,8 @@ bool COMXImage::CreateThumb(const std::string& srcFile, unsigned int maxHeight,
-   bool okay = false;
-   COMXImageFile file;
-   COMXImageReEnc reenc;
-+  if (!reenc.Gpu())
-+    return false;
-   void *pDestBuffer;
-   unsigned int nDestSize;
-   int orientation = additional_info == "flipped" ? 1:0;
-@@ -310,6 +320,9 @@ bool COMXImage::DecodeJpegToTexture(COMXImageFile *file, unsigned int width, uns
-   bool ret = false;
-   COMXTexture omx_image;
- 
-+  if (!omx_image.Gpu())
-+    return false;
-+
-   struct textureinfo *tex = new struct textureinfo;
-   if (!tex)
-     return NULL;
-@@ -924,7 +937,7 @@ bool COMXImageFile::ReadFile(const std::string& inputFile, int orientation)
- 
- COMXImageDec::COMXImageDec()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_decoded_buffer = NULL;
-   OMX_INIT_STRUCTURE(m_decoded_format);
-   m_success = false;
-@@ -936,7 +949,8 @@ COMXImageDec::~COMXImageDec()
- 
-   OMX_INIT_STRUCTURE(m_decoded_format);
-   m_decoded_buffer = NULL;
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXImageDec::Close()
-@@ -1086,6 +1100,9 @@ bool COMXImageDec::HandlePortSettingChange(unsigned int resize_width, unsigned i
- 
- bool COMXImageDec::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes, unsigned width, unsigned height, unsigned stride, void *pixels)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
-   OMX_BUFFERHEADERTYPE *omx_buffer = NULL;
-@@ -1223,7 +1240,7 @@ bool COMXImageDec::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes
- 
- COMXImageEnc::COMXImageEnc()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   CSingleLock lock(m_OMXSection);
-   OMX_INIT_STRUCTURE(m_encoded_format);
-   m_encoded_buffer = NULL;
-@@ -1247,11 +1264,15 @@ COMXImageEnc::~COMXImageEnc()
-       m_omx_encoder.Deinitialize();
-     }
-   }
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- bool COMXImageEnc::Encode(unsigned char *buffer, int size, unsigned width, unsigned height, unsigned int pitch)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
- 
-   unsigned int demuxer_bytes = 0;
-@@ -1432,6 +1453,9 @@ bool COMXImageEnc::Encode(unsigned char *buffer, int size, unsigned width, unsig
- bool COMXImageEnc::CreateThumbnailFromSurface(unsigned char* buffer, unsigned int width, unsigned int height,
-     unsigned int format, unsigned int pitch, const std::string& destFile)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   if(format != XB_FMT_A8R8G8B8 || !buffer)
-   {
-     CLog::Log(LOGDEBUG, "%s::%s : %s failed format=0x%x\n", CLASSNAME, __func__, destFile.c_str(), format);
-@@ -1465,7 +1489,7 @@ bool COMXImageEnc::CreateThumbnailFromSurface(unsigned char* buffer, unsigned in
- 
- COMXImageReEnc::COMXImageReEnc()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_encoded_buffer = NULL;
-   m_pDestBuffer = NULL;
-   m_nDestAllocSize = 0;
-@@ -1479,7 +1503,8 @@ COMXImageReEnc::~COMXImageReEnc()
-     free (m_pDestBuffer);
-   m_pDestBuffer = NULL;
-   m_nDestAllocSize = 0;
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXImageReEnc::Close()
-@@ -1771,6 +1796,9 @@ bool COMXImageReEnc::HandlePortSettingChange(unsigned int resize_width, unsigned
- 
- bool COMXImageReEnc::ReEncode(COMXImageFile &srcFile, unsigned int maxWidth, unsigned int maxHeight, void * &pDestBuffer, unsigned int &nDestSize)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
- 
-@@ -1943,14 +1971,15 @@ bool COMXImageReEnc::ReEncode(COMXImageFile &srcFile, unsigned int maxWidth, uns
- 
- COMXTexture::COMXTexture()
- {
--  limit_calls_enter();
-+  m_gpu = limit_calls_enter();
-   m_success = false;
- }
- 
- COMXTexture::~COMXTexture()
- {
-   Close();
--  limit_calls_leave();
-+  if (m_gpu)
-+    limit_calls_leave();
- }
- 
- void COMXTexture::Close()
-@@ -2134,6 +2163,9 @@ bool COMXTexture::HandlePortSettingChange(unsigned int resize_width, unsigned in
- 
- bool COMXTexture::Decode(const uint8_t *demuxer_content, unsigned demuxer_bytes, unsigned int width, unsigned int height, void *egl_image)
- {
-+  if (!m_gpu)
-+    return false;
-+
-   CSingleLock lock(m_OMXSection);
-   OMX_ERRORTYPE omx_err = OMX_ErrorNone;
- 
-diff --git a/xbmc/cores/omxplayer/OMXImage.h b/xbmc/cores/omxplayer/OMXImage.h
-index a93aa82..6f38dbc 100644
---- a/xbmc/cores/omxplayer/OMXImage.h
-+++ b/xbmc/cores/omxplayer/OMXImage.h
-@@ -133,6 +133,7 @@ protected:
-   OMX_PARAM_PORTDEFINITIONTYPE  m_decoded_format;
-   CCriticalSection              m_OMXSection;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXImageEnc
-@@ -144,6 +145,7 @@ public:
-   // Required overrides
-   bool CreateThumbnailFromSurface(unsigned char* buffer, unsigned int width, unsigned int height,
-       unsigned int format, unsigned int pitch, const std::string& destFile);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool Encode(unsigned char *buffer, int size, unsigned int width, unsigned int height, unsigned int pitch);
-   // Components
-@@ -152,6 +154,7 @@ protected:
-   OMX_PARAM_PORTDEFINITIONTYPE  m_encoded_format;
-   CCriticalSection              m_OMXSection;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXImageReEnc
-@@ -163,6 +166,7 @@ public:
-   // Required overrides
-   void Close();
-   bool ReEncode(COMXImageFile &srcFile, unsigned int width, unsigned int height, void * &pDestBuffer, unsigned int &nDestSize);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool HandlePortSettingChange(unsigned int resize_width, unsigned int resize_height, int orientation, bool port_settings_changed);
-   // Components
-@@ -176,6 +180,7 @@ protected:
-   void                          *m_pDestBuffer;
-   unsigned int                  m_nDestAllocSize;
-   bool                          m_success;
-+  bool                          m_gpu;
- };
- 
- class COMXTexture
-@@ -187,6 +192,7 @@ public:
-   // Required overrides
-   void Close(void);
-   bool Decode(const uint8_t *data, unsigned size, unsigned int width, unsigned int height, void *egl_image);
-+  bool Gpu() { return m_gpu; }
- protected:
-   bool HandlePortSettingChange(unsigned int resize_width, unsigned int resize_height, void *egl_image, bool port_settings_changed);
- 
-@@ -201,6 +207,7 @@ protected:
-   OMX_BUFFERHEADERTYPE *m_egl_buffer;
-   CCriticalSection              m_OMXSection;
-   bool              m_success;
-+  bool              m_gpu;
- };
- 
- extern COMXImage g_OMXImage;
-
-From dfb7b32bb3d8220a30ad67a26dfc388b4c4d9f43 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 2 Jan 2016 18:08:16 +0000
-Subject: [PATCH 75/93] [mmalcodec] Don't send zero sized extradata
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 3579966..0f5c1b7 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -501,7 +501,7 @@ bool CMMALVideo::SendCodecConfigData()
- {
-   CSingleLock lock(m_sharedSection);
-   MMAL_STATUS_T status;
--  if (!m_dec_input_pool)
-+  if (!m_dec_input_pool || !m_hints.extrasize)
-     return true;
-   // send code config data
-   MMAL_BUFFER_HEADER_T *buffer = mmal_queue_timedwait(m_dec_input_pool->queue, 500);
-
-From 374227275b47f31ca0cca887a12e5cce187cdd55 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 3 Jan 2016 19:12:16 +0000
-Subject: [PATCH 76/93] stereoscopicmanager: Ensure we don't have a stale value
- of videoplayer stereoscopic mode
-
----
- xbmc/guilib/StereoscopicsManager.cpp | 30 +++++++++++++++++++++---------
- xbmc/guilib/StereoscopicsManager.h   |  2 ++
- 2 files changed, 23 insertions(+), 9 deletions(-)
-
-diff --git a/xbmc/guilib/StereoscopicsManager.cpp b/xbmc/guilib/StereoscopicsManager.cpp
-index d9e0fa4..4942f01 100644
---- a/xbmc/guilib/StereoscopicsManager.cpp
-+++ b/xbmc/guilib/StereoscopicsManager.cpp
-@@ -140,12 +140,7 @@ void CStereoscopicsManager::SetStereoMode(const RENDER_STEREO_MODE &mode)
- 
-   // resolve automatic mode before applying
-   if (mode == RENDER_STEREO_MODE_AUTO)
--  {
--    if (g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
--      applyMode = GetStereoModeOfPlayingVideo();
--    else
--      applyMode = RENDER_STEREO_MODE_OFF;
--  }
-+    applyMode = GetStereoModeOfPlayingVideo();
- 
-   if (applyMode != currentMode && applyMode >= RENDER_STEREO_MODE_OFF)
-   {
-@@ -209,7 +204,7 @@ RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeByUserChoice(const std::s
- {
-   RENDER_STEREO_MODE mode = GetStereoMode();
-   // if no stereo mode is set already, suggest mode of current video by preselecting it
--  if (mode == RENDER_STEREO_MODE_OFF && g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
-+  if (mode == RENDER_STEREO_MODE_OFF)
-     mode = GetStereoModeOfPlayingVideo();
- 
-   CGUIDialogSelect* pDlgSelect = (CGUIDialogSelect*)g_windowManager.GetWindow(WINDOW_DIALOG_SELECT);
-@@ -254,8 +249,8 @@ RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeByUserChoice(const std::s
- RENDER_STEREO_MODE CStereoscopicsManager::GetStereoModeOfPlayingVideo(void)
- {
-   RENDER_STEREO_MODE mode = RENDER_STEREO_MODE_OFF;
-+  std::string playerMode = GetVideoStereoMode();
- 
--  std::string playerMode = g_infoManager.GetLabel(VIDEOPLAYER_STEREOSCOPIC_MODE);
-   if (!playerMode.empty())
-   {
-     int convertedMode = ConvertVideoToGuiStereoMode(playerMode);
-@@ -504,6 +499,23 @@ void CStereoscopicsManager::ApplyStereoMode(const RENDER_STEREO_MODE &mode, bool
-   }
- }
- 
-+std::string CStereoscopicsManager::GetVideoStereoMode()
-+{
-+  std::string playerMode;
-+  if (g_application.m_pPlayer->IsPlaying())
-+  {
-+    SPlayerVideoStreamInfo videoInfo;
-+    g_application.m_pPlayer->GetVideoStreamInfo(videoInfo);
-+    playerMode = videoInfo.stereoMode;
-+  }
-+  return playerMode;
-+}
-+
-+bool CStereoscopicsManager::IsVideoStereoscopic()
-+{
-+  return !GetVideoStereoMode().empty();
-+}
-+
- void CStereoscopicsManager::OnPlaybackStarted(void)
- {
-   STEREOSCOPIC_PLAYBACK_MODE playbackMode = (STEREOSCOPIC_PLAYBACK_MODE) CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_STEREOSCOPICPLAYBACKMODE);
-@@ -513,7 +525,7 @@ void CStereoscopicsManager::OnPlaybackStarted(void)
-   if (playbackMode == STEREOSCOPIC_PLAYBACK_MODE_IGNORE && mode == RENDER_STEREO_MODE_OFF)
-     return;
- 
--  if (!g_infoManager.EvaluateBool("videoplayer.isstereoscopic"))
-+  if (!CStereoscopicsManager::IsVideoStereoscopic())
-   {
-     // exit stereo mode if started item is not stereoscopic
-     // and if user prefers to stop 3D playback when movie is finished
-diff --git a/xbmc/guilib/StereoscopicsManager.h b/xbmc/guilib/StereoscopicsManager.h
-index ec2310f..f090bb9 100644
---- a/xbmc/guilib/StereoscopicsManager.h
-+++ b/xbmc/guilib/StereoscopicsManager.h
-@@ -92,6 +92,8 @@ private:
-   void ApplyStereoMode(const RENDER_STEREO_MODE &mode, bool notify = true);
-   void OnPlaybackStarted(void);
-   void OnPlaybackStopped(void);
-+  std::string GetVideoStereoMode();
-+  bool IsVideoStereoscopic();
- 
-   RENDER_STEREO_MODE m_stereoModeSetByUser;
-   RENDER_STEREO_MODE m_lastStereoModeSetByUser;
-
-From d33bcf6304a60bfbdbc993eadab11505ae675851 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 77/93] [3d] Make MVC a valid 3D filename tag
-
----
- xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
- xbmc/settings/AdvancedSettings.cpp   | 2 ++
- xbmc/settings/AdvancedSettings.h     | 1 +
- 3 files changed, 12 insertions(+)
-
-diff --git a/xbmc/guilib/StereoscopicsManager.cpp b/xbmc/guilib/StereoscopicsManager.cpp
-index 4942f01..ff67d0d 100644
---- a/xbmc/guilib/StereoscopicsManager.cpp
-+++ b/xbmc/guilib/StereoscopicsManager.cpp
-@@ -197,6 +197,15 @@ std::string CStereoscopicsManager::DetectStereoModeByString(const std::string &n
-   if (re.RegFind(searchString) > -1)
-     stereoMode = "top_bottom";
- 
-+  if (!re.RegComp(g_advancedSettings.m_stereoscopicregex_mvc.c_str()))
-+  {
-+    CLog::Log(LOGERROR, "%s: Invalid RegExp for matching 3d MVC content:'%s'", __FUNCTION__, g_advancedSettings.m_stereoscopicregex_mvc.c_str());
-+    return stereoMode;
-+  }
-+
-+  if (re.RegFind(searchString) > -1)
-+    stereoMode = "left_right";
-+
-   return stereoMode;
- }
- 
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index 8045a03..aeea13b 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -388,6 +388,7 @@ void CAdvancedSettings::Initialize()
-   m_stereoscopicregex_3d = "[-. _]3d[-. _]";
-   m_stereoscopicregex_sbs = "[-. _]h?sbs[-. _]";
-   m_stereoscopicregex_tab = "[-. _]h?tab[-. _]";
-+  m_stereoscopicregex_mvc = "[-. _]h?mvc[-. _]";
- 
-   m_videoAssFixedWorks = false;
- 
-@@ -514,6 +515,7 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-     XMLUtils::GetString(pElement, "stereoscopicregex3d", m_stereoscopicregex_3d);
-     XMLUtils::GetString(pElement, "stereoscopicregexsbs", m_stereoscopicregex_sbs);
-     XMLUtils::GetString(pElement, "stereoscopicregextab", m_stereoscopicregex_tab);
-+    XMLUtils::GetString(pElement, "stereoscopicregexmvc", m_stereoscopicregex_mvc);
-     XMLUtils::GetFloat(pElement, "subsdelayrange", m_videoSubsDelayRange, 10, 600);
-     XMLUtils::GetFloat(pElement, "audiodelayrange", m_videoAudioDelayRange, 10, 600);
-     XMLUtils::GetInt(pElement, "blackbarcolour", m_videoBlackBarColour, 0, 255);
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 93de9bd..4da88f4 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -386,6 +386,7 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
-     std::string m_stereoscopicregex_3d;
-     std::string m_stereoscopicregex_sbs;
-     std::string m_stereoscopicregex_tab;
-+    std::string m_stereoscopicregex_mvc;
- 
-     /*!< @brief position behavior of ass subtitiles when setting "subtitle position on screen" set to "fixed"
-     True to show at the fixed position set in video calibration
-
-From ff22ccfba36a15d2ed383bf5543f2dd3b9c6a618 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sun, 24 Jan 2016 16:42:04 +0000
-Subject: [PATCH 78/93] fixup! [build] Add patches to ffmpeg for native build
-
----
- tools/depends/target/ffmpeg/autobuild.sh | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/depends/target/ffmpeg/autobuild.sh b/tools/depends/target/ffmpeg/autobuild.sh
-index f6d4c3b..cc59d17 100755
---- a/tools/depends/target/ffmpeg/autobuild.sh
-+++ b/tools/depends/target/ffmpeg/autobuild.sh
-@@ -131,7 +131,7 @@ patch -p1 < ../0001-Discard-data-before-VO-VOL-in-mpeg-4-over-mpegts.patch
- patch -p1 < ../hevcdsp_ARM_NEON_optimized_epel_functions.patch
- patch -p1 < ../added_ARM_NEON_optimized_SAO_patches.patch
- patch -p1 < ../pfcd_hevc_optimisations.patch
--patch -p1 < ../add_h264_MVC_support.patch
-+patch -p1 < ../0001-Squashed-commit-of-the-following.patch
- 
- CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
- ./configure --prefix=$FFMPEG_PREFIX \
-
-From 8bcf9f72ff12412fdc4c8139be071c2448d51ae7 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 19:58:30 +0000
-Subject: [PATCH 79/93] OMXAudio: Make use of m_bGotFrame to skip decoding when
- full
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index 33c4c6a..b9dab89 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -158,7 +158,8 @@ int COMXAudioCodecOMX::Decode(BYTE* pData, int iSize, double dts, double pts)
-   if (!m_pCodecContext) return -1;
- 
-   AVPacket avpkt;
--  m_bGotFrame = false;
-+  if (m_bGotFrame)
-+    return 0;
-   av_init_packet(&avpkt);
-   avpkt.data = pData;
-   avpkt.size = iSize;
-@@ -257,6 +258,7 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-       outputSize = 0;
-     }
-   }
-+  m_bGotFrame = false;
-   int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
- 
-   if (m_bFirstFrame)
-@@ -274,7 +276,6 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   if (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate)
-   {
-      int ret = m_iBufferOutputUsed;
--     m_bGotFrame = false;
-      m_iBufferOutputUsed = 0;
-      dts = m_dts;
-      pts = m_pts;
-
-From 7044ba837edb2060a28bf534f5327d90e1c545e5 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 20:01:18 +0000
-Subject: [PATCH 80/93] OMXAudio: Handle GetData before adding the next buffer
- so we can be sure it fits
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 28 ++++++++++++++--------------
- 1 file changed, 14 insertions(+), 14 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index b9dab89..f150dc6 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -206,12 +206,24 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   /* output audio will be packed */
-   int outputSize = av_samples_get_buffer_size(&outLineSize, m_pCodecContext->channels, m_pFrame1->nb_samples, m_desiredSampleFormat, 1);
- 
-+  // if this buffer won't fit then flush out what we have
-+  int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
-+  if (m_iBufferOutputUsed && (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate))
-+  {
-+     int ret = m_iBufferOutputUsed;
-+     m_iBufferOutputUsed = 0;
-+     dts = m_dts;
-+     pts = m_pts;
-+     *dst = m_pBufferOutput;
-+     return ret;
-+  }
-+  m_frameSize = outputSize;
-+
-   if (m_iBufferOutputAlloced < m_iBufferOutputUsed + outputSize)
-   {
-      m_pBufferOutput = (BYTE*)av_realloc(m_pBufferOutput, m_iBufferOutputUsed + outputSize + FF_INPUT_BUFFER_PADDING_SIZE);
-      m_iBufferOutputAlloced = m_iBufferOutputUsed + outputSize;
-   }
--  *dst = m_pBufferOutput;
- 
-   /* need to convert format */
-   if(m_pCodecContext->sample_fmt != m_desiredSampleFormat)
-@@ -259,28 +271,16 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-     }
-   }
-   m_bGotFrame = false;
--  int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
- 
-   if (m_bFirstFrame)
-   {
--    CLog::Log(LOGDEBUG, "COMXAudioCodecOMX::GetData size=%d/%d line=%d/%d buf=%p, desired=%d", inputSize, outputSize, inLineSize, outLineSize, *dst, desired_size);
-+    CLog::Log(LOGDEBUG, "COMXAudioCodecOMX::GetData size=%d/%d line=%d/%d buf=%p, desired=%d", inputSize, outputSize, inLineSize, outLineSize, m_pBufferOutput, desired_size);
-     m_bFirstFrame = false;
-   }
-   m_iBufferOutputUsed += outputSize;
- 
-   if (!m_bNoConcatenate && m_pCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP && m_frameSize && (int)m_frameSize != outputSize)
-     CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
--  m_frameSize = outputSize;
--
--  // if next buffer submitted won't fit then flush it out
--  if (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate)
--  {
--     int ret = m_iBufferOutputUsed;
--     m_iBufferOutputUsed = 0;
--     dts = m_dts;
--     pts = m_pts;
--     return ret;
--  }
-   return 0;
- }
- 
-
-From c4abc577bf4c12d48cc800930c2d292c0a65031f Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 26 Jan 2016 20:03:49 +0000
-Subject: [PATCH 81/93] OMXAudio: Handle changes in decoded audio size
- correctly
-
----
- xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp | 14 +++++++-------
- 1 file changed, 7 insertions(+), 7 deletions(-)
-
-diff --git a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-index f150dc6..4956b5b 100644
---- a/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-+++ b/xbmc/cores/omxplayer/OMXAudioCodecOMX.cpp
-@@ -103,10 +103,6 @@ bool COMXAudioCodecOMX::Open(CDVDStreamInfo &hints)
-   if (m_pCodecContext->request_channel_layout)
-     CLog::Log(LOGNOTICE,"COMXAudioCodecOMX::Open() Requesting channel layout of %x", (unsigned)m_pCodecContext->request_channel_layout);
- 
--  // vorbis and wma2v2 have variable sized planar output, so skip concatenation
--  if (hints.codec == AV_CODEC_ID_VORBIS || hints.codec == AV_CODEC_ID_WMAV2)
--    m_bNoConcatenate = true;
--
-   if(m_pCodecContext->bits_per_coded_sample == 0)
-     m_pCodecContext->bits_per_coded_sample = 16;
- 
-@@ -206,12 +202,19 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-   /* output audio will be packed */
-   int outputSize = av_samples_get_buffer_size(&outLineSize, m_pCodecContext->channels, m_pFrame1->nb_samples, m_desiredSampleFormat, 1);
- 
-+  if (!m_bNoConcatenate && m_iBufferOutputUsed && (int)m_frameSize != outputSize)
-+  {
-+    CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
-+    m_bNoConcatenate = true;
-+  }
-+
-   // if this buffer won't fit then flush out what we have
-   int desired_size = AUDIO_DECODE_OUTPUT_BUFFER * (m_pCodecContext->channels * GetBitsPerSample()) >> (rounded_up_channels_shift[m_pCodecContext->channels] + 4);
-   if (m_iBufferOutputUsed && (m_iBufferOutputUsed + outputSize > desired_size || m_bNoConcatenate))
-   {
-      int ret = m_iBufferOutputUsed;
-      m_iBufferOutputUsed = 0;
-+     m_bNoConcatenate = false;
-      dts = m_dts;
-      pts = m_pts;
-      *dst = m_pBufferOutput;
-@@ -278,9 +281,6 @@ int COMXAudioCodecOMX::GetData(BYTE** dst, double &dts, double &pts)
-     m_bFirstFrame = false;
-   }
-   m_iBufferOutputUsed += outputSize;
--
--  if (!m_bNoConcatenate && m_pCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP && m_frameSize && (int)m_frameSize != outputSize)
--    CLog::Log(LOGERROR, "COMXAudioCodecOMX::GetData Unexpected change of size (%d->%d)", m_frameSize, outputSize);
-   return 0;
- }
- 
-
-From df922f986ab0b2dc1363224ef6c72a7a8ac616dc Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 4 Feb 2016 15:29:55 +0000
-Subject: [PATCH 82/93] MMALCodec: Set dropped flag on output pictures when
- input requested that
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 6 ++++++
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h   | 1 +
- 2 files changed, 7 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 0f5c1b7..7b025fd 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -759,6 +759,7 @@ void CMMALVideo::SetDropState(bool bDrop)
- {
-   if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-     CLog::Log(LOGDEBUG, "%s::%s - bDrop(%d)", CLASSNAME, __func__, bDrop);
-+  m_dropState = bDrop;
- }
- 
- int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-@@ -794,6 +795,8 @@ int CMMALVideo::Decode(uint8_t* pData, int iSize, double dts, double pts)
-        buffer->length = (uint32_t)iSize > buffer->alloc_size ? buffer->alloc_size : (uint32_t)iSize;
-        // set a flag so we can identify primary frames from generated frames (deinterlace)
-        buffer->flags = MMAL_BUFFER_HEADER_FLAG_USER0;
-+       if (m_dropState)
-+         buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER3;
- 
-        memcpy(buffer->data, pData, buffer->length);
-        iSize -= buffer->length;
-@@ -944,6 +947,7 @@ void CMMALVideo::Reset(void)
-   m_decoderPts = DVD_NOPTS_VALUE;
-   m_demuxerPts = DVD_NOPTS_VALUE;
-   m_codecControlFlags = 0;
-+  m_dropState = false;
- }
- 
- void CMMALVideo::SetSpeed(int iSpeed)
-@@ -1021,6 +1025,8 @@ bool CMMALVideo::GetPicture(DVDVideoPicture* pDvdVideoPicture)
- 
-     pDvdVideoPicture->MMALBuffer->Acquire();
-     pDvdVideoPicture->iFlags  = DVP_FLAG_ALLOCATED;
-+    if (buffer->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER3)
-+      pDvdVideoPicture->iFlags |= DVP_FLAG_DROPPED;
-     if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-       CLog::Log(LOGINFO, "%s::%s dts:%.3f pts:%.3f flags:%x:%x MMALBuffer:%p mmal_buffer:%p", CLASSNAME, __func__,
-           pDvdVideoPicture->dts == DVD_NOPTS_VALUE ? 0.0 : pDvdVideoPicture->dts*1e-6, pDvdVideoPicture->pts == DVD_NOPTS_VALUE ? 0.0 : pDvdVideoPicture->pts*1e-6,
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-index ca28c6f..bf669e0 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.h
-@@ -124,6 +124,7 @@ protected:
-   double            m_decoderPts;
-   int               m_speed;
-   int               m_codecControlFlags;
-+  bool              m_dropState;
- 
-   CCriticalSection m_sharedSection;
-   MMAL_COMPONENT_T *m_dec;
-
-From a6102bec84b610166da7448d80b853e5efd649a1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 3 Feb 2016 21:35:01 +0000
-Subject: [PATCH 83/93] DVDVideoCodecFFmpeg: Enable refcounted frames
-
-Without this frames will get (deep) copied when deinterlace is set to automatic,
-but file is not deinterlaced.
-
-For 1080p24 that costs 150MB/s of memory bandwidth which is very expensive.
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-index c2f3287..64087f2 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
-@@ -316,6 +316,10 @@ bool CDVDVideoCodecFFmpeg::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options
-       av_opt_set(m_pCodecContext, it->m_name.c_str(), it->m_value.c_str(), 0);
-   }
- 
-+  // If non-zero, the decoded audio and video frames returned from avcodec_decode_video2() are reference-counted and are valid indefinitely.
-+  // Without this frames will get (deep) copied when deinterlace is set to automatic, but file is not deinterlaced.
-+  m_pCodecContext->refcounted_frames = 1;
-+
-   if (avcodec_open2(m_pCodecContext, pCodec, NULL) < 0)
-   {
-     CLog::Log(LOGDEBUG,"CDVDVideoCodecFFmpeg::Open() Unable to open codec");
-
-From a301f546dcfa4bf1ceaa9737a60a835826d54fec Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 19 Feb 2016 13:45:23 +0000
-Subject: [PATCH 84/93] mmal: increase decode buffering a little to help harder
- MVC files
-
-PR8610 reduced buffering in codec which generally improved behaviour,
-but we have some reports of hard streams (like 3D BluRay) that now
-lag. The problem is when the codec's input buffer runs dry you
-waste useful decoder cycles. It seems adding another two frames of
-latency to decoder gets the performance back.
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-index 7b025fd..08f61fc 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/MMALCodec.cpp
-@@ -676,7 +676,7 @@ bool CMMALVideo::Open(CDVDStreamInfo &hints, CDVDCodecOptions &options)
- 
-   // limit number of callback structures in video_decode to reduce latency. Too low and video hangs.
-   // negative numbers have special meaning. -1=size of DPB -2=size of DPB+1
--  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -3);
-+  status = mmal_port_parameter_set_uint32(m_dec_input, MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS, -5);
-   if (status != MMAL_SUCCESS)
-     CLog::Log(LOGERROR, "%s::%s Failed to configure max num callbacks on %s (status=%x %s)", CLASSNAME, __func__, m_dec_input->name, status, mmal_status_to_string(status));
- 
-
-From c0b0aad15a9ffcd921bb70af5bf9200ee7a93fed Mon Sep 17 00:00:00 2001
-From: Mario Holzinger <sandman01xda@gmail.com>
-Date: Fri, 11 Dec 2015 16:48:57 +0100
-Subject: [PATCH 85/93] touch panel to display adjustment
-
----
- xbmc/input/linux/LinuxInputDevices.cpp |  7 ++++---
- xbmc/settings/AdvancedSettings.cpp     | 16 ++++++++++++++++
- xbmc/settings/AdvancedSettings.h       |  6 ++++++
- 3 files changed, 26 insertions(+), 3 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index 9f3e866..a506956 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -98,6 +98,7 @@ typedef unsigned long kernel_ulong_t;
- #include "utils/log.h"
- #include "input/touch/generic/GenericTouchActionHandler.h"
- #include "input/touch/generic/GenericTouchInputHandler.h"
-+#include "settings/AdvancedSettings.h"
- 
- #ifndef BITS_PER_LONG
- #define BITS_PER_LONG        (sizeof(long) * 8)
-@@ -653,13 +654,13 @@ bool CLinuxInputDevice::AbsEvent(const struct input_event& levt, XBMC_Event& dev
-   switch (levt.code)
-   {
-   case ABS_X:
--    m_mouseX = levt.value;
-+    m_mouseX = (int)((float)levt.value * g_advancedSettings.m_screenAlign_xStretchFactor) + g_advancedSettings.m_screenAlign_xOffset; // stretch and shift touch x coordinates
-     break;
- 
-   case ABS_Y:
--    m_mouseY = levt.value;
-+    m_mouseY = (int)((float)levt.value * g_advancedSettings.m_screenAlign_yStretchFactor) + g_advancedSettings.m_screenAlign_yOffset; // stretch and shift touch y coordinates
-     break;
--  
-+
-   case ABS_MISC:
-     remoteStatus = levt.value & 0xFF;
-     break;
-diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp
-index aeea13b..326e9f5 100644
---- a/xbmc/settings/AdvancedSettings.cpp
-+++ b/xbmc/settings/AdvancedSettings.cpp
-@@ -303,6 +303,12 @@ void CAdvancedSettings::Initialize()
-   m_iEdlCommBreakAutowait = 0;             // Off by default
-   m_iEdlCommBreakAutowind = 0;             // Off by default
- 
-+  // Touchscreen  default values if no adjustment is necessarry
-+  m_screenAlign_xOffset = 0;
-+  m_screenAlign_yOffset= 0;
-+  m_screenAlign_xStretchFactor = 1.0;
-+  m_screenAlign_yStretchFactor = 1.0;
-+
-   m_curlconnecttimeout = 10;
-   m_curllowspeedtime = 20;
-   m_curlretries = 2;
-@@ -871,6 +877,16 @@ void CAdvancedSettings::ParseSettingsFile(const std::string &file)
-     XMLUtils::GetInt(pElement, "commbreakautowind", m_iEdlCommBreakAutowind, 0, 10);        // Between 0 and 10 seconds
-   }
- 
-+  // Touchscreen
-+  pElement = pRootElement->FirstChildElement("touchscreen");
-+  if (pElement)
-+  {
-+    XMLUtils::GetInt(pElement, "x_offset", m_screenAlign_xOffset );
-+    XMLUtils::GetInt(pElement, "y_offset", m_screenAlign_yOffset );
-+    XMLUtils::GetFloat(pElement, "x_stretch_factor", m_screenAlign_xStretchFactor );
-+    XMLUtils::GetFloat(pElement, "y_stretch_factor", m_screenAlign_yStretchFactor );
-+  }
-+
-   // picture exclude regexps
-   TiXmlElement* pPictureExcludes = pRootElement->FirstChildElement("pictureexcludes");
-   if (pPictureExcludes)
-diff --git a/xbmc/settings/AdvancedSettings.h b/xbmc/settings/AdvancedSettings.h
-index 4da88f4..1bdc77e 100644
---- a/xbmc/settings/AdvancedSettings.h
-+++ b/xbmc/settings/AdvancedSettings.h
-@@ -334,6 +334,12 @@ class CAdvancedSettings : public ISettingCallback, public ISettingsHandler
-     std::string m_cpuTempCmd;
-     std::string m_gpuTempCmd;
- 
-+    // Touchscreen
-+    int m_screenAlign_xOffset;
-+    int m_screenAlign_yOffset;
-+    float m_screenAlign_xStretchFactor;
-+    float m_screenAlign_yStretchFactor;
-+
-     /* PVR/TV related advanced settings */
-     int m_iPVRTimeCorrection;     /*!< @brief correct all times (epg tags, timer tags, recording tags) by this amount of minutes. defaults to 0. */
-     int m_iPVRInfoToggleInterval; /*!< @brief if there are more than 1 pvr gui info item available (e.g. multiple recordings active at the same time), use this toggle delay in milliseconds. defaults to 3000. */
-
-From d3c755950fe7e7b255a2a28cafd105affb3aab13 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 15 Feb 2016 15:51:11 +0000
-Subject: [PATCH 86/93] touch panel to display adjustment - support multitouch
-
-PR7978 allowed a simple calibration to be applied to touch input
-It didn't cover multitouch which this adds support for
----
- xbmc/input/linux/LinuxInputDevices.cpp | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index a506956..3579cd0 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -707,7 +707,7 @@ bool CLinuxInputDevice::mtAbsEvent(const struct input_event& levt)
-   case ABS_MT_POSITION_X:
-     if (m_mt_currentSlot < TOUCH_MAX_POINTERS)
-     {
--      m_mt_x[m_mt_currentSlot] = levt.value;
-+      m_mt_x[m_mt_currentSlot] = (int)((float)levt.value * g_advancedSettings.m_screenAlign_xStretchFactor) + g_advancedSettings.m_screenAlign_xOffset; // stretch and shift touch x coordinates
-       if (m_mt_event[m_mt_currentSlot] == TouchInputUnchanged)
-         m_mt_event[m_mt_currentSlot] = TouchInputMove;
-     }
-@@ -716,7 +716,7 @@ bool CLinuxInputDevice::mtAbsEvent(const struct input_event& levt)
-   case ABS_MT_POSITION_Y:
-     if (m_mt_currentSlot < TOUCH_MAX_POINTERS)
-     {
--      m_mt_y[m_mt_currentSlot] = levt.value;
-+      m_mt_y[m_mt_currentSlot] = (int)((float)levt.value * g_advancedSettings.m_screenAlign_yStretchFactor) + g_advancedSettings.m_screenAlign_yOffset; // stretch and shift touch y coordinates;
-       if (m_mt_event[m_mt_currentSlot] == TouchInputUnchanged)
-         m_mt_event[m_mt_currentSlot] = TouchInputMove;
-     }
-
-From f517a6ff4ab7f04b9a6ba371d3429e5ae95eb3d1 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 22 Mar 2016 22:28:13 +0000
-Subject: [PATCH 87/93] [linux] Move hotplug checking into its own thread
-
-Currently checking for new linux input devices is called from the rendering thread.
-We've been getting reports of skipped frames on raspberry pi.
-
-Specifically if eventlirc is active and you have an LIRC capable device connected
-the hotplug check is slow and you get a frame skip every ten seconds.
-
-So move this code into its own thread
----
- xbmc/input/linux/LinuxInputDevices.cpp | 54 +++++++++++++++++++++-------------
- xbmc/input/linux/LinuxInputDevices.h   | 14 ++++++++-
- xbmc/windowing/WinEventsLinux.cpp      |  1 +
- xbmc/windowing/WinEventsLinux.h        |  2 ++
- 4 files changed, 49 insertions(+), 22 deletions(-)
-
-diff --git a/xbmc/input/linux/LinuxInputDevices.cpp b/xbmc/input/linux/LinuxInputDevices.cpp
-index 3579cd0..7b3c6ad 100644
---- a/xbmc/input/linux/LinuxInputDevices.cpp
-+++ b/xbmc/input/linux/LinuxInputDevices.cpp
-@@ -1081,6 +1081,30 @@ bool CLinuxInputDevice::IsUnplugged()
-   return m_bUnplugged;
- }
- 
-+CLinuxInputDevicesCheckHotplugged::CLinuxInputDevicesCheckHotplugged(CLinuxInputDevices &parent) :
-+    CThread("CLinuxInputDevicesCheckHotplugged"), m_parent(parent)
-+{
-+  Create();
-+  SetPriority(THREAD_PRIORITY_BELOW_NORMAL);
-+}
-+
-+CLinuxInputDevicesCheckHotplugged::~CLinuxInputDevicesCheckHotplugged()
-+{
-+  m_bStop = true;
-+  m_quitEvent.Set();
-+  StopThread(true);
-+}
-+
-+void CLinuxInputDevicesCheckHotplugged::Process()
-+{
-+  while (!m_bStop)
-+  {
-+    m_parent.CheckHotplugged();
-+    // every ten seconds
-+    m_quitEvent.WaitMSec(10000);
-+  }
-+}
-+
- bool CLinuxInputDevices::CheckDevice(const char *device)
- {
-   int fd;
-@@ -1147,10 +1171,6 @@ void CLinuxInputDevices::InitAvailable()
-  */
- void CLinuxInputDevices::CheckHotplugged()
- {
--  CSingleLock lock(m_devicesListLock);
--
--  int deviceId = m_devices.size();
--
-   /* No devices specified. Try to guess some. */
-   for (int i = 0; i < MAX_LINUX_INPUT_DEVICES; i++)
-   {
-@@ -1158,18 +1178,22 @@ void CLinuxInputDevices::CheckHotplugged()
-     bool ispresent = false;
- 
-     snprintf(buf, 32, "/dev/input/event%d", i);
--
--    for (size_t j = 0; j < m_devices.size(); j++)
-     {
--      if (m_devices[j]->GetFileName().compare(buf) == 0)
-+      CSingleLock lock(m_devicesListLock);
-+      for (size_t j = 0; j < m_devices.size(); j++)
-       {
--        ispresent = true;
--        break;
-+        if (m_devices[j]->GetFileName().compare(buf) == 0)
-+        {
-+          ispresent = true;
-+          break;
-+        }
-       }
-     }
- 
-     if (!ispresent && CheckDevice(buf))
-     {
-+      CSingleLock lock(m_devicesListLock);
-+      int deviceId = m_devices.size();
-       CLog::Log(LOGINFO, "Found input device %s", buf);
-       m_devices.push_back(new CLinuxInputDevice(buf, deviceId));
-       ++deviceId;
-@@ -1360,18 +1384,6 @@ XBMC_Event CLinuxInputDevices::ReadEvent()
-     InitAvailable();
-     m_bReInitialize = false;
-   }
--  else
--  {
--    time_t now;
--    time(&now);
--
--    if ((now - m_lastHotplugCheck) >= 10)
--    {
--      CheckHotplugged();
--      m_lastHotplugCheck = now;
--    }
--  }
--
-   CSingleLock lock(m_devicesListLock);
- 
-   XBMC_Event event;
-diff --git a/xbmc/input/linux/LinuxInputDevices.h b/xbmc/input/linux/LinuxInputDevices.h
-index 8c88a1d..4fde1aa 100644
---- a/xbmc/input/linux/LinuxInputDevices.h
-+++ b/xbmc/input/linux/LinuxInputDevices.h
-@@ -28,6 +28,7 @@
- #include "threads/SingleLock.h"
- #include "input/touch/ITouchInputHandler.h"
- #include "input/touch/generic/IGenericTouchGestureDetector.h"
-+#include "threads/Thread.h"
- 
- struct KeymapEntry
- {
-@@ -105,7 +106,18 @@ private:
-   bool CheckDevice(const char *device);
-   std::vector<CLinuxInputDevice*> m_devices;
-   bool m_bReInitialize;
--  time_t m_lastHotplugCheck;
-+};
-+
-+class CLinuxInputDevicesCheckHotplugged : protected CThread
-+{
-+public:
-+  CLinuxInputDevicesCheckHotplugged(CLinuxInputDevices &parent);
-+  ~CLinuxInputDevicesCheckHotplugged();
-+private:
-+  CLinuxInputDevices &m_parent;
-+  CEvent m_quitEvent;
-+protected:
-+  virtual void Process();
- };
- 
- #endif /* LINUXINPUTDEVICES_H_ */
-diff --git a/xbmc/windowing/WinEventsLinux.cpp b/xbmc/windowing/WinEventsLinux.cpp
-index 2b3d77a..fb6c987 100644
---- a/xbmc/windowing/WinEventsLinux.cpp
-+++ b/xbmc/windowing/WinEventsLinux.cpp
-@@ -134,6 +134,7 @@ bool CWinEventsLinux::MessagePump()
-   if (!m_initialized)
-   {
-     m_devices.InitAvailable();
-+    m_checkHotplug = std::unique_ptr<CLinuxInputDevicesCheckHotplugged>(new CLinuxInputDevicesCheckHotplugged(m_devices));
-     m_initialized = true;
- #ifdef TARGET_RASPBERRY_PI
-     LoadXML("Pointer.xml");
-diff --git a/xbmc/windowing/WinEventsLinux.h b/xbmc/windowing/WinEventsLinux.h
-index 23244a2..c82ba84 100644
---- a/xbmc/windowing/WinEventsLinux.h
-+++ b/xbmc/windowing/WinEventsLinux.h
-@@ -22,6 +22,7 @@
- #define WINDOW_EVENTS_LINUX_H
- 
- #pragma once
-+#include <memory>
- #include "windowing/WinEvents.h"
- #include "input/linux/LinuxInputDevices.h"
- #include "guilib/TextureManager.h"
-@@ -44,6 +45,7 @@ public:
- private:
-   static bool m_initialized;
-   static CLinuxInputDevices m_devices;
-+  std::unique_ptr<CLinuxInputDevicesCheckHotplugged> m_checkHotplug;
- #ifdef TARGET_RASPBERRY_PI
-   bool LoadXML(const std::string strFileName);
-   int64_t m_last_mouse_move_time;
-
-From ffb8b5378dbb2c53f1411e051f0c7eec9555ca83 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Jul 2015 17:47:33 +0100
-Subject: [PATCH 88/93] [rbp] Refactor the vsync handle to support multiple
- callers
-
----
- xbmc/linux/RBP.cpp | 100 ++++++++++++++++++++++++++++++++++-------------------
- xbmc/linux/RBP.h   |  10 ++++--
- 2 files changed, 73 insertions(+), 37 deletions(-)
-
-diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
-index 13b0504..ddc2b9c 100644
---- a/xbmc/linux/RBP.cpp
-+++ b/xbmc/linux/RBP.cpp
-@@ -34,6 +34,7 @@
- #include <sys/ioctl.h>
- #include <linux/ioctl.h>
- #include "rpi_user_vcsm.h"
-+#include "utils/TimeUtils.h"
- 
- #define MAJOR_NUM 100
- #define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-@@ -56,6 +57,8 @@ CRBP::CRBP()
-   m_enabled = 0;
-   m_mb = mbox_open();
-   vcsm_init();
-+  m_vsync_count = 0;
-+  m_last_vsync = 0;
- }
- 
- CRBP::~CRBP()
-@@ -73,7 +76,7 @@ void CRBP::InitializeSettings()
- 
- bool CRBP::Initialize()
- {
--  CSingleLock lock (m_critSection);
-+  CSingleLock lock(m_critSection);
-   if (m_initialized)
-     return true;
- 
-@@ -133,11 +136,62 @@ void CRBP::LogFirmwareVerison()
-   CLog::Log(LOGNOTICE, "Config:\n%s", response);
- }
- 
-+static void vsync_callback_static(DISPMANX_UPDATE_HANDLE_T u, void *arg)
-+{
-+  CRBP *rbp = reinterpret_cast<CRBP*>(arg);
-+  rbp->VSyncCallback();
-+}
-+
-+void CRBP::VSyncCallback()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  m_vsync_count++;
-+  m_last_vsync = CurrentHostCounter();
-+  m_vsync_cond.notifyAll();
-+}
-+
-+unsigned int CRBP::VsyncCount()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  return m_vsync_count;
-+}
-+
-+int64_t CRBP::LastVsync()
-+{
-+  CSingleLock lock(m_vsync_lock);
-+  return m_last_vsync;
-+}
-+
-+unsigned int CRBP::WaitVsync(unsigned int target)
-+{
-+  CSingleLock vlock(m_vsync_lock);
-+  DISPMANX_DISPLAY_HANDLE_T display = m_display;
-+  XbmcThreads::EndTime delay(50);
-+  if (target == ~0U)
-+    target = m_vsync_count+1;
-+  while (!delay.IsTimePast())
-+  {
-+    CSingleLock lock(m_critSection);
-+    if (m_vsync_count >= target)
-+      break;
-+    lock.Leave();
-+    if (!m_vsync_cond.wait(vlock, delay.MillisLeft()))
-+      break;
-+  }
-+  if (m_vsync_count < target)
-+    CLog::Log(LOGDEBUG, "CRBP::%s no  vsync %d/%d display:%x(%x) delay:%d", __FUNCTION__, m_vsync_count, target, m_display, display, delay.MillisLeft());
-+
-+  return m_vsync_count;
-+}
-+
- DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- {
-+  CSingleLock lock(m_critSection);
-   if (m_display == DISPMANX_NO_HANDLE)
-   {
-     m_display = vc_dispmanx_display_open( 0 /*screen*/ );
-+    int s = vc_dispmanx_vsync_callback(m_display, vsync_callback_static, (void *)this);
-+    assert(s == 0);
-     init_cursor();
-   }
-   return m_display;
-@@ -145,16 +199,20 @@ DISPMANX_DISPLAY_HANDLE_T CRBP::OpenDisplay(uint32_t device)
- 
- void CRBP::CloseDisplay(DISPMANX_DISPLAY_HANDLE_T display)
- {
-+  CSingleLock lock(m_critSection);
-   assert(display == m_display);
-+  int s = vc_dispmanx_vsync_callback(m_display, NULL, NULL);
-+  assert(s == 0);
-+  uninit_cursor();
-   vc_dispmanx_display_close(m_display);
-   m_display = DISPMANX_NO_HANDLE;
--  uninit_cursor();
- }
- 
- void CRBP::GetDisplaySize(int &width, int &height)
- {
-+  CSingleLock lock(m_critSection);
-   DISPMANX_MODEINFO_T info;
--  if (vc_dispmanx_display_get_info(m_display, &info) == 0)
-+  if (m_display != DISPMANX_NO_HANDLE && vc_dispmanx_display_get_info(m_display, &info) == 0)
-   {
-     width = info.width;
-     height = info.height;
-@@ -183,13 +241,13 @@ unsigned char *CRBP::CaptureDisplay(int width, int height, int *pstride, bool sw
-     flags |= DISPMANX_SNAPSHOT_PACK;
- 
-   stride = ((width + 15) & ~15) * 4;
--  image = new unsigned char [height * stride];
- 
--  if (image)
-+  CSingleLock lock(m_critSection);
-+  if (m_display != DISPMANX_NO_HANDLE)
-   {
-+    image = new unsigned char [height * stride];
-     resource = vc_dispmanx_resource_create( VC_IMAGE_RGBA32, width, height, &vc_image_ptr );
- 
--    assert(m_display != DISPMANX_NO_HANDLE);
-     vc_dispmanx_snapshot(m_display, resource, (DISPMANX_TRANSFORM_T)flags);
- 
-     vc_dispmanx_rect_set(&rect, 0, 0, width, height);
-@@ -201,35 +259,6 @@ unsigned char *CRBP::CaptureDisplay(int width, int height, int *pstride, bool sw
-   return image;
- }
- 
--
--static void vsync_callback(DISPMANX_UPDATE_HANDLE_T u, void *arg)
--{
--  CEvent *sync = (CEvent *)arg;
--  sync->Set();
--}
--
--void CRBP::WaitVsync()
--{
--  int s;
--  DISPMANX_DISPLAY_HANDLE_T m_display = vc_dispmanx_display_open( 0 /*screen*/ );
--  if (m_display == DISPMANX_NO_HANDLE)
--  {
--    CLog::Log(LOGDEBUG, "CRBP::%s skipping while display closed", __func__);
--    return;
--  }
--  m_vsync.Reset();
--  s = vc_dispmanx_vsync_callback(m_display, vsync_callback, (void *)&m_vsync);
--  if (s == 0)
--  {
--    m_vsync.WaitMSec(1000);
--  }
--  else assert(0);
--  s = vc_dispmanx_vsync_callback(m_display, NULL, NULL);
--  assert(s == 0);
--  vc_dispmanx_display_close( m_display );
--}
--
--
- void CRBP::Deinitialize()
- {
-   if (m_omx_image_init)
-@@ -260,6 +289,7 @@ void CRBP::Deinitialize()
- 
- double CRBP::AdjustHDMIClock(double adjust)
- {
-+  CSingleLock lock(m_critSection);
-   char response[80];
-   vc_gencmd(response, sizeof response, "hdmi_adjust_clock %f", adjust);
-   char *p = strchr(response, '=');
-diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
-index 2eee35d..3b59cd9 100644
---- a/xbmc/linux/RBP.h
-+++ b/xbmc/linux/RBP.h
-@@ -77,7 +77,10 @@ public:
-   // stride can be null for packed output
-   unsigned char *CaptureDisplay(int width, int height, int *stride, bool swap_red_blue, bool video_only = true);
-   DllOMX *GetDllOMX() { return m_OMX ? m_OMX->GetDll() : NULL; }
--  void WaitVsync();
-+  unsigned int WaitVsync(unsigned int target = ~0U);
-+  int64_t LastVsync();
-+  unsigned int VsyncCount();
-+  void VSyncCallback();
-   double AdjustHDMIClock(double adjust);
-   double GetAdjustHDMIClock() { return m_last_pll_adjust; }
-   int GetMBox() { return m_mb; }
-@@ -94,7 +97,10 @@ private:
-   bool       m_codec_wvc1_enabled;
-   COMXCore   *m_OMX;
-   DISPMANX_DISPLAY_HANDLE_T m_display;
--  CEvent     m_vsync;
-+  CCriticalSection m_vsync_lock;
-+  XbmcThreads::ConditionVariable m_vsync_cond;
-+  unsigned int m_vsync_count;
-+  int64_t m_last_vsync;
-   class DllLibOMXCore;
-   CCriticalSection m_critSection;
-   double m_last_pll_adjust;
-
-From 2e13233a89859c10902059dd34160582af62ee1e Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 24 Mar 2016 23:24:18 +0000
-Subject: [PATCH 89/93] [mmalrenderer] Wait for vsync before submitting to mmal
- when display sync is disabled
-
-This avoids an issue where video occasionally goes stuttery after a seek, until the next pause/play or seek.
-The issue is when display sync is disabled, and framerate of video matches display, and render times are coincident with vsync
-you find that depending on timestamp/scheduling jitter, you may or may not get an update each vsync resulting in stuttery video.
-
-Some scheme to force render times to be dependent on vsync is required. We do this by using a queue that is popped following vsyncs.
-We ensure the queue always has 1 or 2 frames so it doesn't underrun with a late frame, but this adds a frame of latency.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 66 ++++++++++++++++++++++++++++--
- xbmc/cores/VideoRenderers/MMALRenderer.h   |  6 ++-
- 2 files changed, 68 insertions(+), 4 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index ad3f66f..76c4682 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -239,7 +239,7 @@ bool CMMALRenderer::init_vout(ERenderFormat format)
-   return true;
- }
- 
--CMMALRenderer::CMMALRenderer()
-+CMMALRenderer::CMMALRenderer() : CThread("MMALRenderer")
- {
-   CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   m_vout = NULL;
-@@ -253,15 +253,69 @@ CMMALRenderer::CMMALRenderer()
-   m_iYV12RenderBuffer = 0;
-   m_inflight = 0;
-   m_sharpness = -2.0f;
-+  m_queue = mmal_queue_create();
-+  Create();
- }
- 
- CMMALRenderer::~CMMALRenderer()
- {
-   CSingleLock lock(m_sharedSection);
-   CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-+  StopThread(true);
-+  mmal_queue_destroy(m_queue);
-   UnInit();
- }
- 
-+void CMMALRenderer::Process()
-+{
-+  SetPriority(THREAD_PRIORITY_ABOVE_NORMAL);
-+  while (!m_bStop)
-+  {
-+    g_RBP.WaitVsync();
-+    double dfps = g_graphicsContext.GetFPS();
-+    if (dfps <= 0.0)
-+      dfps = m_fps;
-+    // This algorithm is basically making the decision according to Bresenham's line algorithm.  Imagine drawing a line where x-axis is display frames, and y-axis is video frames
-+    m_error += m_fps / dfps;
-+    // we may need to discard frames if queue length gets too high or video frame rate is above display frame rate
-+    while (mmal_queue_length(m_queue) > 2 || m_error > 1.0)
-+    {
-+      if (m_error > 1.0)
-+        m_error -= 1.0;
-+      MMAL_BUFFER_HEADER_T *buffer = mmal_queue_get(m_queue);
-+      if (buffer)
-+      {
-+        if (m_format == RENDER_FMT_MMAL)
-+        {
-+          CMMALVideoBuffer *omvb = (CMMALVideoBuffer *)buffer->user_data;
-+          assert(buffer == omvb->mmal_buffer);
-+          m_inflight--;
-+          omvb->Release();
-+        }
-+        else if (m_format == RENDER_FMT_YUV420P)
-+        {
-+          CYUVVideoBuffer *omvb = (CYUVVideoBuffer *)buffer->user_data;
-+          assert(buffer == omvb->mmal_buffer);
-+          m_inflight--;
-+          omvb->Release();
-+        }
-+        if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+          CLog::Log(LOGDEBUG, "%s::%s - discard buffer:%p vsync:%d queue:%d diff:%f", CLASSNAME, __func__, buffer, g_RBP.VsyncCount(), mmal_queue_length(m_queue), m_error);
-+      }
-+    }
-+    // this is case where we would like to display a new frame
-+    if (m_error > 0.0)
-+    {
-+      m_error -= 1.0;
-+      MMAL_BUFFER_HEADER_T *buffer = mmal_queue_get(m_queue);
-+      if (buffer)
-+        mmal_port_send_buffer(m_vout_input, buffer);
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - buffer:%p vsync:%d queue:%d diff:%f", CLASSNAME, __func__, buffer, g_RBP.VsyncCount(), mmal_queue_length(m_queue), m_error);
-+    }
-+  }
-+}
-+
- void CMMALRenderer::AddProcessor(CMMALVideoBuffer *buffer, int index)
- {
- #if defined(MMAL_DEBUG_VERBOSE)
-@@ -496,7 +550,10 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-         return;
-       omvb->Acquire();
-       omvb->mmal_buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER1 | MMAL_BUFFER_HEADER_FLAG_USER2;
--      mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-+      if (!CSettings::GetInstance().GetBool("videoplayer.usedisplayasclock"))
-+        mmal_queue_put(m_queue, omvb->mmal_buffer);
-+      else
-+        mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-     }
-     else
-       CLog::Log(LOGDEBUG, "%s::%s - No buffer to update", CLASSNAME, __func__);
-@@ -516,7 +573,10 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-       omvb->Acquire();
-       omvb->mmal_buffer->flags |= MMAL_BUFFER_HEADER_FLAG_USER1 | MMAL_BUFFER_HEADER_FLAG_USER2;
-       omvb->mmal_buffer->user_data = omvb;
--      mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-+      if (!CSettings::GetInstance().GetBool("videoplayer.usedisplayasclock"))
-+        mmal_queue_put(m_queue, omvb->mmal_buffer);
-+      else
-+        mmal_port_send_buffer(m_vout_input, omvb->mmal_buffer);
-     }
-     else
-       CLog::Log(LOGDEBUG, "%s::%s - No buffer to update: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.h b/xbmc/cores/VideoRenderers/MMALRenderer.h
-index a71e645..34cb294 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.h
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.h
-@@ -29,6 +29,7 @@
- #include "cores/dvdplayer/DVDStreamInfo.h"
- #include "guilib/Geometry.h"
- #include "BaseRenderer.h"
-+#include "threads/Thread.h"
- 
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/util/mmal_util.h>
-@@ -55,7 +56,7 @@ protected:
-   long m_refs;
- };
- 
--class CMMALRenderer : public CBaseRenderer
-+class CMMALRenderer : public CBaseRenderer, public CThread
- {
-   struct YUVBUFFER
-   {
-@@ -66,6 +67,7 @@ public:
-   CMMALRenderer();
-   ~CMMALRenderer();
- 
-+  void Process();
-   virtual void Update();
-   virtual void SetupScreenshot() {};
- 
-@@ -125,6 +127,8 @@ protected:
-   MMAL_COMPONENT_T *m_vout;
-   MMAL_PORT_T *m_vout_input;
-   MMAL_POOL_T *m_vout_input_pool;
-+  MMAL_QUEUE_T *m_queue;
-+  double m_error;
- 
-   bool init_vout(ERenderFormat format);
-   void ReleaseBuffers();
-
-From 1f63176ba9c91a1f1e58dec440a56e90ee944583 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 16:57:19 +0000
-Subject: [PATCH 90/93] mmal: Include mmal renderer logging in video category
-
-On Pi the decoder and renderer are closely coupled so combining the
-logging category makes sense to me.
----
- xbmc/cores/VideoRenderers/MMALRenderer.cpp | 109 +++++++++++------------------
- 1 file changed, 42 insertions(+), 67 deletions(-)
-
-diff --git a/xbmc/cores/VideoRenderers/MMALRenderer.cpp b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-index 76c4682..e1099da 100644
---- a/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-+++ b/xbmc/cores/VideoRenderers/MMALRenderer.cpp
-@@ -36,10 +36,6 @@
- 
- #define CLASSNAME "CMMALRenderer"
- 
--#ifdef _DEBUG
--#define MMAL_DEBUG_VERBOSE
--#endif
--
- 
- CYUVVideoBuffer::CYUVVideoBuffer()
- {
-@@ -56,9 +52,8 @@ CYUVVideoBuffer::~CYUVVideoBuffer()
- CYUVVideoBuffer *CYUVVideoBuffer::Acquire()
- {
-   long count = AtomicIncrement(&m_refs);
--#ifdef MMAL_DEBUG_VERBOSE
--  CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
-   (void)count;
-   return this;
- }
-@@ -66,9 +61,8 @@ CYUVVideoBuffer *CYUVVideoBuffer::Acquire()
- long CYUVVideoBuffer::Release()
- {
-   long count = AtomicDecrement(&m_refs);
--#ifdef MMAL_DEBUG_VERBOSE
--  CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s omvb:%p mmal:%p ref:%ld", CLASSNAME, __func__, this, mmal_buffer, count);
-   if (count == 0)
-   {
-     mmal_buffer_header_release(mmal_buffer);
-@@ -88,9 +82,8 @@ CRenderInfo CMMALRenderer::GetRenderInfo()
-   if (!m_bMMALConfigured)
-     m_bMMALConfigured = init_vout(RENDER_FMT_MMAL);
- 
--  #if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s cookie:%p", CLASSNAME, __func__, (void *)m_vout_input_pool);
--  #endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s cookie:%p", CLASSNAME, __func__, (void *)m_vout_input_pool);
- 
-   info.max_buffer_size = NUM_BUFFERS;
-   info.optimal_buffer_size = NUM_BUFFERS;
-@@ -112,18 +105,16 @@ void CMMALRenderer::vout_input_port_cb(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *
-   {
-     CMMALVideoBuffer *omvb = (CMMALVideoBuffer *)buffer->user_data;
-     assert(buffer == omvb->mmal_buffer);
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
-     omvb->Release();
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-     CYUVVideoBuffer *omvb = (CYUVVideoBuffer *)buffer->user_data;
-     assert(buffer == omvb->mmal_buffer);
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s port:%p omvb:%p mmal:%p len:%d cmd:%x flags:%x flight:%d", CLASSNAME, __func__, port, omvb, omvb->mmal_buffer, buffer->length, buffer->cmd, buffer->flags, m_inflight);
-     m_inflight--;
-     omvb->Release();
-   }
-@@ -318,9 +309,8 @@ void CMMALRenderer::Process()
- 
- void CMMALRenderer::AddProcessor(CMMALVideoBuffer *buffer, int index)
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s - %p (%p) %i", CLASSNAME, __func__, buffer, buffer->mmal_buffer, index);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s - %p (%p) %i", CLASSNAME, __func__, buffer, buffer->mmal_buffer, index);
- 
-   YUVBUFFER &buf = m_buffers[index];
-   assert(!buf.MMALBuffer);
-@@ -372,17 +362,15 @@ int CMMALRenderer::GetImage(YV12Image *image, int source, bool readonly)
- {
-   if (!image || source < 0)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - invalid: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - invalid: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
-     return -1;
-   }
- 
-   if (m_format == RENDER_FMT_MMAL)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - MMAL: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - MMAL: image:%p source:%d ro:%d flight:%d", CLASSNAME, __func__, image, source, readonly, m_inflight);
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-@@ -425,9 +413,8 @@ int CMMALRenderer::GetImage(YV12Image *image, int source, bool readonly)
-     if (!buf.YUVBuffer)
-       return -1;
-     buf.YUVBuffer->mmal_buffer = buffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - YUV: image:%p source:%d ro:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, image, source, readonly, buf.YUVBuffer, buffer, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - YUV: image:%p source:%d ro:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, image, source, readonly, buf.YUVBuffer, buffer, m_inflight);
-     buf.YUVBuffer->Acquire();
-   }
-   else assert(0);
-@@ -440,16 +427,14 @@ void CMMALRenderer::ReleaseBuffer(int idx)
-   CSingleLock lock(m_sharedSection);
-   if (!m_bMMALConfigured)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, idx);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, idx);
-     return;
-   }
-   if (m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - bypass: source:%d", CLASSNAME, __func__, idx);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - bypass: source:%d", CLASSNAME, __func__, idx);
-     return;
-   }
- 
-@@ -457,17 +442,15 @@ void CMMALRenderer::ReleaseBuffer(int idx)
-   if (m_format == RENDER_FMT_MMAL)
-   {
-     CMMALVideoBuffer *omvb = buffer->MMALBuffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - MMAL: source:%d omvb:%p mmal:%p", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - MMAL: source:%d omvb:%p mmal:%p", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL);
-     SAFE_RELEASE(buffer->MMALBuffer);
-   }
-   else if (m_format == RENDER_FMT_YUV420P)
-   {
-     CYUVVideoBuffer *omvb = buffer->YUVBuffer;
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - YUV: source:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL, m_inflight);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - YUV: source:%d omvb:%p mmal:%p flight:%d", CLASSNAME, __func__, idx, omvb, omvb ? omvb->mmal_buffer:NULL, m_inflight);
-     if (omvb && omvb->mmal_buffer)
-       SAFE_RELEASE(buffer->YUVBuffer);
-   }
-@@ -491,9 +474,8 @@ void CMMALRenderer::Flush()
- 
- void CMMALRenderer::Update()
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   if (!m_bConfigured) return;
-   ManageDisplay();
- }
-@@ -505,9 +487,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
- 
-   if (!m_bConfigured)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-     return;
-   }
- 
-@@ -529,9 +510,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
- 
-   if (m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - bypass: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - bypass: clear:%d flags:%x alpha:%d source:%d", CLASSNAME, __func__, clear, flags, alpha, source);
-     return;
-   }
-   SetVideoRect(m_sourceRect, m_destRect);
-@@ -542,9 +522,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     CMMALVideoBuffer *omvb = buffer->MMALBuffer;
-     if (omvb && omvb->mmal_buffer)
-     {
--#if defined(MMAL_DEBUG_VERBOSE)
--      CLog::Log(LOGDEBUG, "%s::%s - MMAL: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
--#endif
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - MMAL: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
-       // we only want to upload frames once
-       if (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER1)
-         return;
-@@ -563,9 +542,8 @@ void CMMALRenderer::RenderUpdate(bool clear, DWORD flags, DWORD alpha)
-     CYUVVideoBuffer *omvb = buffer->YUVBuffer;
-     if (omvb && omvb->mmal_buffer)
-     {
--#if defined(MMAL_DEBUG_VERBOSE)
--      CLog::Log(LOGDEBUG, "%s::%s - YUV: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
--#endif
-+      if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+        CLog::Log(LOGDEBUG, "%s::%s - YUV: clear:%d flags:%x alpha:%d source:%d omvb:%p mmal:%p mflags:%x", CLASSNAME, __func__, clear, flags, alpha, source, omvb, omvb->mmal_buffer, omvb->mmal_buffer->flags);
-       // we only want to upload frames once
-       if (omvb->mmal_buffer->flags & MMAL_BUFFER_HEADER_FLAG_USER1)
-         return;
-@@ -589,15 +567,13 @@ void CMMALRenderer::FlipPage(int source)
-   CSingleLock lock(m_sharedSection);
-   if (!m_bConfigured || m_format == RENDER_FMT_BYPASS)
-   {
--#if defined(MMAL_DEBUG_VERBOSE)
--    CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, source);
--#endif
-+    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+      CLog::Log(LOGDEBUG, "%s::%s - not configured: source:%d", CLASSNAME, __func__, source);
-     return;
-   }
- 
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s - source:%d", CLASSNAME, __func__, source);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s - source:%d", CLASSNAME, __func__, source);
- 
-   m_iYV12RenderBuffer = source;
- }
-@@ -630,9 +606,8 @@ unsigned int CMMALRenderer::PreInit()
- 
- void CMMALRenderer::ReleaseBuffers()
- {
--#if defined(MMAL_DEBUG_VERBOSE)
--  CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
--#endif
-+  if (g_advancedSettings.CanLogComponent(LOGVIDEO))
-+    CLog::Log(LOGDEBUG, "%s::%s", CLASSNAME, __func__);
-   for (int i=0; i<NUM_BUFFERS; i++)
-     ReleaseBuffer(i);
- }
-
-From 6a008777d9f6c0b2b8724fc66cecf74ac7c32383 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 17:34:48 +0000
-Subject: [PATCH 91/93] rendermanager: Increase configure timeout to see if it
- fixes video playing in background issues
-
----
- xbmc/cores/VideoRenderers/RenderManager.cpp | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp
-index 7a99ac4..4b03c86 100644
---- a/xbmc/cores/VideoRenderers/RenderManager.cpp
-+++ b/xbmc/cores/VideoRenderers/RenderManager.cpp
-@@ -244,7 +244,7 @@ bool CXBMCRenderManager::Configure(unsigned int width, unsigned int height, unsi
-   CSingleLock    lock2(m_presentlock);
- 
-   /* make sure any queued frame was fully presented */
--  XbmcThreads::EndTime endtime(5000);
-+  XbmcThreads::EndTime endtime(10000);
-   while(m_presentstep != PRESENT_IDLE && m_presentstep != PRESENT_READY)
-   {
-     if(endtime.IsTimePast())
-
-From 67223b6440475c4797aa2aa1949f73c078114474 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Wed, 23 Mar 2016 17:39:47 +0000
-Subject: [PATCH 92/93] Revert "[rbp] Use default resampling setting on Pi2"
-
-This reverts commit e6b2f1693480ad5d8062acaed512393e72fb9b1d.
----
- system/settings/rbp2.xml | 5 -----
- 1 file changed, 5 deletions(-)
-
-diff --git a/system/settings/rbp2.xml b/system/settings/rbp2.xml
-index 52778ec..8cc8f19 100644
---- a/system/settings/rbp2.xml
-+++ b/system/settings/rbp2.xml
-@@ -23,11 +23,6 @@
-         <setting id="audiooutput.ac3transcode" help="36429">
-         </setting>
-       </group>
--      <group id="1">
--        <setting id="audiooutput.processquality">
--          <default>30</default> <!-- AE_QUALITY_MID -->
--        </setting>
--      </group>
-     </category>
-   </section>
- </settings>
-
-From bb33be4220a3fd1ad131ec1f2218f7b4750fda98 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Thu, 31 Mar 2016 20:00:15 +0100
-Subject: [PATCH 93/93] Revert "[rbp] Make sync playback to display the default
- option"
-
-This reverts commit 492a2e7ac5fb1895b71b62f68918e74db053f0b9.
----
- system/settings/rbp.xml | 7 -------
- 1 file changed, 7 deletions(-)
-
-diff --git a/system/settings/rbp.xml b/system/settings/rbp.xml
-index f2a6892..1506035 100644
---- a/system/settings/rbp.xml
-+++ b/system/settings/rbp.xml
-@@ -1,13 +1,6 @@
- <?xml version="1.0" encoding="utf-8" ?>
- <settings>
-   <section id="videos">
--    <category id="videoplayer">
--      <group id="3">
--        <setting id="videoplayer.usedisplayasclock">
--          <default>true</default>
--        </setting>
--      </group>
--    </category>
-     <category id="videoacceleration">
-       <group id="1">
-         <visible>false</visible>
diff --git a/projects/WeTek_Play/patches/kodi/0001-Fix-ALSA-sound-output-for-Amlogic-based-devices.patch b/projects/WeTek_Play/patches/kodi/0001-Fix-ALSA-sound-output-for-Amlogic-based-devices.patch
index b15591d6b2..72c97da58d 100644
--- a/projects/WeTek_Play/patches/kodi/0001-Fix-ALSA-sound-output-for-Amlogic-based-devices.patch
+++ b/projects/WeTek_Play/patches/kodi/0001-Fix-ALSA-sound-output-for-Amlogic-based-devices.patch
@@ -1,17 +1,17 @@
-From 5e5453322e71c16b8b96b471a9e5c32b96d6b1af Mon Sep 17 00:00:00 2001
+From fde3d3d609e570aa3a8691a4e66e07dce1c80b25 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Wed, 16 Apr 2014 22:02:01 +0300
-Subject: [PATCH 01/17] Fix ALSA sound output for Amlogic-based devices.
+Subject: [PATCH] [aml] Fix ALSA sound output for Amlogic-based devices.
 
 ---
- xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp |   19 +++++++++++++++++++
+ xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp | 19 +++++++++++++++++++
  1 file changed, 19 insertions(+)
 
 diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-index df50940..9f80c06 100644
+index fbccce0..6bf2c53 100644
 --- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
 +++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-@@ -83,6 +83,17 @@ static unsigned int ALSASampleRateList[] =
+@@ -84,6 +84,17 @@ static unsigned int ALSASampleRateList[] =
    0
  };
  
@@ -29,7 +29,7 @@ index df50940..9f80c06 100644
  CAESinkALSA::CAESinkALSA() :
    m_bufferSize(0),
    m_formatSampleRateMul(0.0),
-@@ -741,12 +752,20 @@ bool CAESinkALSA::InitializeHW(const ALSAConfig &inconfig, ALSAConfig &outconfig
+@@ -748,12 +759,20 @@ bool CAESinkALSA::InitializeHW(const ALSAConfig &inconfig, ALSAConfig &outconfig
    */
    periodSize  = std::min(periodSize, (snd_pcm_uframes_t) sampleRate / 20);
    bufferSize  = std::min(bufferSize, (snd_pcm_uframes_t) sampleRate / 5);
@@ -50,6 +50,3 @@ index df50940..9f80c06 100644
  
    CLog::Log(LOGDEBUG, "CAESinkALSA::InitializeHW - Request: periodSize %lu, bufferSize %lu", periodSize, bufferSize);
  
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0008-Reorder-libraries-in-configure-script-to-prevent-lin.patch b/projects/WeTek_Play/patches/kodi/0008-Reorder-libraries-in-configure-script-to-prevent-lin.patch
index 99d12214ae..098407d013 100644
--- a/projects/WeTek_Play/patches/kodi/0008-Reorder-libraries-in-configure-script-to-prevent-lin.patch
+++ b/projects/WeTek_Play/patches/kodi/0008-Reorder-libraries-in-configure-script-to-prevent-lin.patch
@@ -1,7 +1,7 @@
-From d81c2be6e702206b7c00e25be4ee0c0750f1b84d Mon Sep 17 00:00:00 2001
+From 0367076d263846832cff68052f3de362f27e5bc6 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Mon, 8 Sep 2014 23:29:40 +0300
-Subject: [PATCH 08/17] Reorder libraries in configure script to prevent
+Subject: [PATCH] [wetek] Reorder libraries in configure script to prevent
  linker errors when linking with libsmbclient
 
 Place libsmbclient before all other libraries to prevent linker errors when linking
@@ -9,14 +9,14 @@ with libsmbclient if the libc that is currently used doesn't contain some functi
 such as dn_expand (which are often included in libc), but are actually included in
 libresolv.
 ---
- configure.ac |    2 +-
+ configure.ac | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/configure.ac b/configure.ac
-index ff4022f..079fa27 100644
+index fbe2766..c668729 100644
 --- a/configure.ac
 +++ b/configure.ac
-@@ -1475,7 +1475,7 @@ fi
+@@ -1425,7 +1425,7 @@ fi
  # samba
  if test "x$use_samba" != "xno"; then
    PKG_CHECK_MODULES([SAMBA], [smbclient],
@@ -25,6 +25,3 @@ index ff4022f..079fa27 100644
      [AC_CHECK_LIB([smbclient], [main],,
        use_samba=no;AC_MSG_ERROR($missing_library))
        USE_LIBSMBCLIENT=0
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0009-aml-Change-the-sample-rates-that-are-supported-by-AL.patch b/projects/WeTek_Play/patches/kodi/0009-aml-Change-the-sample-rates-that-are-supported-by-AL.patch
index 278fdbdaeb..f24d6c80e1 100644
--- a/projects/WeTek_Play/patches/kodi/0009-aml-Change-the-sample-rates-that-are-supported-by-AL.patch
+++ b/projects/WeTek_Play/patches/kodi/0009-aml-Change-the-sample-rates-that-are-supported-by-AL.patch
@@ -1,22 +1,18 @@
-From a98954cef067010982c83568dfba45da0d43fe4d Mon Sep 17 00:00:00 2001
+From 813e6f31c8006190615cbf8ab031db2362bcf553 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Fri, 19 Sep 2014 01:55:12 +0300
-Subject: [PATCH 09/17] [aml] Change the sample rates that are supported by
- ALSA but unsupported by HDMI to the closest supported
- value
+Subject: [PATCH] [aml] Change the sample rates that are supported by ALSA but
+ unsupported by HDMI to the closest supported value
 
-Conflicts:
-
-	xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
 ---
- xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp |   20 ++++++++++++++++++++
+ xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp | 20 ++++++++++++++++++++
  1 file changed, 20 insertions(+)
 
 diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-index 9f80c06..18303f8 100644
+index 6bf2c53..32ab888 100644
 --- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
 +++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
-@@ -666,6 +666,26 @@ bool CAESinkALSA::InitializeHW(const ALSAConfig &inconfig, ALSAConfig &outconfig
+@@ -673,6 +673,26 @@ bool CAESinkALSA::InitializeHW(const ALSAConfig &inconfig, ALSAConfig &outconfig
    snd_pcm_hw_params_set_access(m_pcm, hw_params, SND_PCM_ACCESS_RW_INTERLEAVED);
  
    unsigned int sampleRate   = inconfig.sampleRate;
@@ -43,6 +39,3 @@ index 9f80c06..18303f8 100644
    snd_pcm_hw_params_set_rate_near    (m_pcm, hw_params, &sampleRate, NULL);
  
    unsigned int channelCount = inconfig.channels;
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0010-aml-Fill-audio-packets-completely-when-resampling-to.patch b/projects/WeTek_Play/patches/kodi/0010-aml-Fill-audio-packets-completely-when-resampling-to.patch
index 706e10d117..634c68112e 100644
--- a/projects/WeTek_Play/patches/kodi/0010-aml-Fill-audio-packets-completely-when-resampling-to.patch
+++ b/projects/WeTek_Play/patches/kodi/0010-aml-Fill-audio-packets-completely-when-resampling-to.patch
@@ -1,20 +1,20 @@
-From 554f4a769d67155cbdf6f45ba256b5700baa65c1 Mon Sep 17 00:00:00 2001
+From b97f9b069a98984109829badcdf8ead92a29ee38 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Sat, 20 Sep 2014 04:43:52 +0300
-Subject: [PATCH 10/17] [aml] Fill audio packets completely when resampling to
+Subject: [PATCH] [aml] Fill audio packets completely when resampling to
  prevent 'audio data unaligned' kernel warnings
 
 ---
- xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp |    4 ++++
+ xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp | 4 ++++
  1 file changed, 4 insertions(+)
 
 diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp
-index 3b0a015..ef431a4 100644
+index dffded7..2f9dc47 100644
 --- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp
 +++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAEBuffer.cpp
-@@ -143,7 +143,11 @@ CActiveAEBufferPoolResample::CActiveAEBufferPoolResample(AEAudioFormat inputForm
-   if (AE_IS_RAW(m_inputFormat.m_dataFormat))
-     m_inputFormat.m_dataFormat = AE_FMT_S16NE;
+@@ -157,7 +157,11 @@ CActiveAEBufferPoolResample::CActiveAEBufferPoolResample(AEAudioFormat inputForm
+     m_inputFormat.m_channelLayout += AE_CH_FC;
+   }
    m_resampler = NULL;
 +#ifdef HAS_LIBAMCODEC
 +  m_fillPackets = true;
@@ -24,6 +24,3 @@ index 3b0a015..ef431a4 100644
    m_drain = false;
    m_empty = true;
    m_procSample = NULL;
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0011-aml-Use-fpsrate-and-fpsscale-instead-of-rfpsrate-and.patch b/projects/WeTek_Play/patches/kodi/0011-aml-Use-fpsrate-and-fpsscale-instead-of-rfpsrate-and.patch
deleted file mode 100644
index fd4bf3281c..0000000000
--- a/projects/WeTek_Play/patches/kodi/0011-aml-Use-fpsrate-and-fpsscale-instead-of-rfpsrate-and.patch
+++ /dev/null
@@ -1,120 +0,0 @@
-From e065d31b7bac0a3fffc0f05922613090cc573709 Mon Sep 17 00:00:00 2001
-From: Alex Deryskyba <alex@codesnake.com>
-Date: Sun, 21 Sep 2014 17:17:14 +0300
-Subject: [PATCH 11/17] [aml] Use fpsrate and fpsscale instead of rfpsrate and
- rfpsscale to detect framerate
-
----
- xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp  |   12 ++-----
- .../DVDCodecs/Video/DVDVideoCodecAmlogic.cpp       |   38 +++++++++-----------
- 2 files changed, 20 insertions(+), 30 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-index 26db4a1..fcdad19 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-@@ -1465,14 +1465,8 @@ bool CAMLCodec::OpenDecoder(CDVDStreamInfo &hints)
-   am_private->video_ratio64    = ((int64_t)video_ratio.num << 32) | video_ratio.den;
- 
-   // handle video rate
--  if (hints.rfpsrate > 0 && hints.rfpsscale != 0)
-+  if (hints.fpsrate > 0 && hints.fpsscale != 0)
-   {
--    // check ffmpeg r_frame_rate 1st
--    am_private->video_rate = 0.5 + (float)UNIT_FREQ * hints.rfpsscale / hints.rfpsrate;
--  }
--  else if (hints.fpsrate > 0 && hints.fpsscale != 0)
--  {
--    // then ffmpeg avg_frame_rate next
-     am_private->video_rate = 0.5 + (float)UNIT_FREQ * hints.fpsscale / hints.fpsrate;
-   }
- 
-@@ -1545,8 +1539,8 @@ bool CAMLCodec::OpenDecoder(CDVDStreamInfo &hints)
-   CLog::Log(LOGDEBUG, "CAMLCodec::OpenDecoder "
-     "hints.width(%d), hints.height(%d), hints.codec(%d), hints.codec_tag(%d), hints.pid(%d)",
-     hints.width, hints.height, hints.codec, hints.codec_tag, hints.pid);
--  CLog::Log(LOGDEBUG, "CAMLCodec::OpenDecoder hints.fpsrate(%d), hints.fpsscale(%d), hints.rfpsrate(%d), hints.rfpsscale(%d), video_rate(%d)",
--    hints.fpsrate, hints.fpsscale, hints.rfpsrate, hints.rfpsscale, am_private->video_rate);
-+  CLog::Log(LOGDEBUG, "CAMLCodec::OpenDecoder hints.fpsrate(%d), hints.fpsscale(%d), video_rate(%d)",
-+    hints.fpsrate, hints.fpsscale, am_private->video_rate);
-   CLog::Log(LOGDEBUG, "CAMLCodec::OpenDecoder hints.aspect(%f), video_ratio.num(%d), video_ratio.den(%d)",
-     hints.aspect, video_ratio.num, video_ratio.den);
-   CLog::Log(LOGDEBUG, "CAMLCodec::OpenDecoder hints.orientation(%d), hints.forced_aspect(%d), hints.extrasize(%d)",
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecAmlogic.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecAmlogic.cpp
-index 24c1ab9..960aae1 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecAmlogic.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/DVDVideoCodecAmlogic.cpp
-@@ -74,9 +74,7 @@ bool CDVDVideoCodecAmlogic::Open(CDVDStreamInfo &hints, CDVDCodecOptions &option
-       m_mpeg2_sequence->width  = m_hints.width;
-       m_mpeg2_sequence->height = m_hints.height;
-       m_mpeg2_sequence->ratio  = m_hints.aspect;
--      if (m_hints.rfpsrate > 0 && m_hints.rfpsscale != 0)
--        m_mpeg2_sequence->rate = (float)m_hints.rfpsrate / m_hints.rfpsscale;
--      else if (m_hints.fpsrate > 0 && m_hints.fpsscale != 0)
-+      if (m_hints.fpsrate > 0 && m_hints.fpsscale != 0)
-         m_mpeg2_sequence->rate = (float)m_hints.fpsrate / m_hints.fpsscale;
-       else
-         m_mpeg2_sequence->rate = 1.0;
-@@ -374,43 +372,41 @@ void CDVDVideoCodecAmlogic::FrameRateTracking(uint8_t *pData, int iSize, double
-       {
-         default:
-         case 0x01:
--          m_hints.rfpsrate = 24000.0;
--          m_hints.rfpsscale = 1001.0;
-+          m_hints.fpsrate = 24000.0;
-+          m_hints.fpsscale = 1001.0;
-           break;
-         case 0x02:
--          m_hints.rfpsrate = 24000.0;
--          m_hints.rfpsscale = 1000.0;
-+          m_hints.fpsrate = 24000.0;
-+          m_hints.fpsscale = 1000.0;
-           break;
-         case 0x03:
--          m_hints.rfpsrate = 25000.0;
--          m_hints.rfpsscale = 1000.0;
-+          m_hints.fpsrate = 25000.0;
-+          m_hints.fpsscale = 1000.0;
-           break;
-         case 0x04:
--          m_hints.rfpsrate = 30000.0;
--          m_hints.rfpsscale = 1001.0;
-+          m_hints.fpsrate = 30000.0;
-+          m_hints.fpsscale = 1001.0;
-           break;
-         case 0x05:
--          m_hints.rfpsrate = 30000.0;
--          m_hints.rfpsscale = 1000.0;
-+          m_hints.fpsrate = 30000.0;
-+          m_hints.fpsscale = 1000.0;
-           break;
-         case 0x06:
--          m_hints.rfpsrate = 50000.0;
--          m_hints.rfpsscale = 1000.0;
-+          m_hints.fpsrate = 50000.0;
-+          m_hints.fpsscale = 1000.0;
-           break;
-         case 0x07:
--          m_hints.rfpsrate = 60000.0;
--          m_hints.rfpsscale = 1001.0;
-+          m_hints.fpsrate = 60000.0;
-+          m_hints.fpsscale = 1001.0;
-           break;
-         case 0x08:
--          m_hints.rfpsrate = 60000.0;
--          m_hints.rfpsscale = 1000.0;
-+          m_hints.fpsrate = 60000.0;
-+          m_hints.fpsscale = 1000.0;
-           break;
-       }
-       m_hints.width    = m_mpeg2_sequence->width;
-       m_hints.height   = m_mpeg2_sequence->height;
-       m_hints.aspect   = m_mpeg2_sequence->ratio;
--      m_hints.fpsrate  = m_hints.rfpsrate;
--      m_hints.fpsscale = m_hints.rfpsscale;
-     }
-     return;
-   }
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0012-Fix-incorrect-frame-rate-detection-of-some-videos-wi.patch b/projects/WeTek_Play/patches/kodi/0012-Fix-incorrect-frame-rate-detection-of-some-videos-wi.patch
deleted file mode 100644
index 9f70d0fa69..0000000000
--- a/projects/WeTek_Play/patches/kodi/0012-Fix-incorrect-frame-rate-detection-of-some-videos-wi.patch
+++ /dev/null
@@ -1,143 +0,0 @@
-From a9ff99a36f9e6cea70f7274312a127563af15dc5 Mon Sep 17 00:00:00 2001
-From: Alex Deryskyba <alex@codesnake.com>
-Date: Sun, 21 Sep 2014 17:20:25 +0300
-Subject: [PATCH 12/17] Fix incorrect frame rate detection of some videos with
- variable frame rate.
-
-Use FFMPEG's r_frame_rate, if it as valid, as a video stream frame rate, otherwise use avg_frame_rate.
-Also remove CDVDStreamInfo.rfpsscale, CDVDStreamInfo.rfpsrate, CDemuxStreamVideo.irFpsScale and CDemuxStreamVideo.irFpsRate,
-they are not needed anymore.
----
- xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h        |  4 ---
- .../cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 29 ++++++++--------------
- xbmc/cores/dvdplayer/DVDStreamInfo.cpp             |  8 ------
- xbmc/cores/dvdplayer/DVDStreamInfo.h               |  2 --
- 4 files changed, 10 insertions(+), 33 deletions(-)
-
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-index d69991e..faf3c9b 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemux.h
-@@ -151,8 +151,6 @@ public:
-   {
-     iFpsScale = 0;
-     iFpsRate = 0;
--    irFpsScale = 0;
--    irFpsRate = 0;
-     iHeight = 0;
-     iWidth = 0;
-     fAspect = 0.0;
-@@ -167,8 +165,6 @@ public:
-   virtual ~CDemuxStreamVideo() {}
-   int iFpsScale; // scale of 1000 and a rate of 29970 will result in 29.97 fps
-   int iFpsRate;
--  int irFpsScale;
--  int irFpsRate;
-   int iHeight; // height of the stream reported by the demuxer
-   int iWidth; // width of the stream reported by the demuxer
-   float fAspect; // display aspect of stream
-diff --git a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-index 1315117..5367b28 100644
---- a/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-+++ b/xbmc/cores/dvdplayer/DVDDemuxers/DVDDemuxFFmpeg.cpp
-@@ -1125,34 +1125,25 @@ CDemuxStream* CDVDDemuxFFmpeg::AddStream(int iId)
- #else
-         AVRational r_frame_rate = pStream->r_frame_rate;
- #endif
-+        int rFrameRate = 0;
-+        if (r_frame_rate.den && r_frame_rate.num)
-+          rFrameRate = r_frame_rate.num / r_frame_rate.den;
-+        bool rFrameRateValid = rFrameRate >= 5 && rFrameRate <= 100;
- 
--        //average fps is more accurate for mkv files
--        if (m_bMatroska && pStream->avg_frame_rate.den && pStream->avg_frame_rate.num)
--        {
--          st->iFpsRate = pStream->avg_frame_rate.num;
--          st->iFpsScale = pStream->avg_frame_rate.den;
--        }
--        else if(r_frame_rate.den && r_frame_rate.num)
-+        if (rFrameRateValid)
-         {
-           st->iFpsRate = r_frame_rate.num;
-           st->iFpsScale = r_frame_rate.den;
-         }
--        else
--        {
--          st->iFpsRate  = 0;
--          st->iFpsScale = 0;
--        }
--
--        // added for aml hw decoder, mkv frame-rate can be wrong.
--        if (r_frame_rate.den && r_frame_rate.num)
-+        else if(pStream->avg_frame_rate.den && pStream->avg_frame_rate.num)
-         {
--          st->irFpsRate = r_frame_rate.num;
--          st->irFpsScale = r_frame_rate.den;
-+          st->iFpsRate = pStream->avg_frame_rate.num;
-+          st->iFpsScale = pStream->avg_frame_rate.den;
-         }
-         else
-         {
--          st->irFpsRate = 0;
--          st->irFpsScale = 0;
-+          st->iFpsRate  = 0;
-+          st->iFpsScale = 0;
-         }
- 
-         if (pStream->codec_info_nb_frames >  0
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-index c1dbd85..03facbe 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.cpp
-@@ -52,8 +52,6 @@ void CDVDStreamInfo::Clear()
- 
-   fpsscale = 0;
-   fpsrate  = 0;
--  rfpsscale= 0;
--  rfpsrate = 0;
-   height   = 0;
-   width    = 0;
-   aspect   = 0.0;
-@@ -97,8 +95,6 @@ bool CDVDStreamInfo::Equal(const CDVDStreamInfo& right, bool withextradata)
-   // VIDEO
-   if( fpsscale != right.fpsscale
-   ||  fpsrate  != right.fpsrate
--  ||  rfpsscale!= right.rfpsscale
--  ||  rfpsrate != right.rfpsrate
-   ||  height   != right.height
-   ||  width    != right.width
-   ||  stills   != right.stills
-@@ -159,8 +155,6 @@ void CDVDStreamInfo::Assign(const CDVDStreamInfo& right, bool withextradata)
-   // VIDEO
-   fpsscale = right.fpsscale;
-   fpsrate  = right.fpsrate;
--  rfpsscale= right.rfpsscale;
--  rfpsrate = right.rfpsrate;
-   height   = right.height;
-   width    = right.width;
-   aspect   = right.aspect;
-@@ -220,8 +214,6 @@ void CDVDStreamInfo::Assign(const CDemuxStream& right, bool withextradata)
-     const CDemuxStreamVideo *stream = static_cast<const CDemuxStreamVideo*>(&right);
-     fpsscale  = stream->iFpsScale;
-     fpsrate   = stream->iFpsRate;
--    rfpsscale = stream->irFpsScale;
--    rfpsrate  = stream->irFpsRate;
-     height    = stream->iHeight;
-     width     = stream->iWidth;
-     aspect    = stream->fAspect;
-diff --git a/xbmc/cores/dvdplayer/DVDStreamInfo.h b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-index c0e22a2..8953ff3 100644
---- a/xbmc/cores/dvdplayer/DVDStreamInfo.h
-+++ b/xbmc/cores/dvdplayer/DVDStreamInfo.h
-@@ -58,8 +58,6 @@ public:
-   // VIDEO
-   int fpsscale; // scale of 1001 and a rate of 60000 will result in 59.94 fps
-   int fpsrate;
--  int rfpsscale;
--  int rfpsrate;
-   int height; // height of the stream reported by the demuxer
-   int width; // width of the stream reported by the demuxer
-   float aspect; // display aspect as reported by demuxer
--- 
-2.1.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0016-aml-Disable-deinterlacing-for-HD-content-while-video.patch b/projects/WeTek_Play/patches/kodi/0016-aml-Disable-deinterlacing-for-HD-content-while-video.patch
index ac06dd2ec9..0bc4a80f05 100644
--- a/projects/WeTek_Play/patches/kodi/0016-aml-Disable-deinterlacing-for-HD-content-while-video.patch
+++ b/projects/WeTek_Play/patches/kodi/0016-aml-Disable-deinterlacing-for-HD-content-while-video.patch
@@ -1,19 +1,19 @@
-From 6aa16d7fe7e6dbe95bdca8069a16d2aa415adf37 Mon Sep 17 00:00:00 2001
+From a2adae2d6a5036b2af02448e6803175a02f368d1 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Tue, 3 Feb 2015 17:58:19 +0100
-Subject: [PATCH 16/17] Disable deinterlacing for HD content while video is
- being played in a window to prevent screen blinking in 1080p50/60hz display
- modes
+Subject: [PATCH] [aml] Disable deinterlacing for HD content while video is
+ being played in a window
 
+... to prevent screen blinking in 1080p50/60hz display modes
 ---
- xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp | 5 +++++
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp | 5 +++++
  1 file changed, 5 insertions(+)
 
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-index 2fad224..74804de 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-@@ -2261,6 +2261,11 @@ void CAMLCodec::SetVideoRect(const CRect &SrcRect, const CRect &DestRect)
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
+index 5dc6592..2103042 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
+@@ -2259,6 +2259,11 @@ void CAMLCodec::SetVideoRect(const CRect &SrcRect, const CRect &DestRect)
    CLog::Log(LOGDEBUG, "CAMLCodec::SetVideoRect:m_stereo_view(%d)", m_stereo_view);
  #endif
  
@@ -25,6 +25,3 @@ index 2fad224..74804de 100644
    // goofy 0/1 based difference in aml axis coordinates.
    // fix them.
    dst_rect.x2--;
--- 
-2.1.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0018-aml-Fix-stuttering-during-a-playback-of-a-video-with.patch b/projects/WeTek_Play/patches/kodi/0018-aml-Fix-stuttering-during-a-playback-of-a-video-with.patch
index 21415b4ead..eb55cce130 100644
--- a/projects/WeTek_Play/patches/kodi/0018-aml-Fix-stuttering-during-a-playback-of-a-video-with.patch
+++ b/projects/WeTek_Play/patches/kodi/0018-aml-Fix-stuttering-during-a-playback-of-a-video-with.patch
@@ -1,18 +1,18 @@
-From 7b1c6ca4cb5bf3310cbfe3b007063439be301b6b Mon Sep 17 00:00:00 2001
+From 3e6754f46448b9718d57b62e8ca122e6217d0b22 Mon Sep 17 00:00:00 2001
 From: Alex Deryskyba <alex@codesnake.com>
 Date: Mon, 2 Mar 2015 09:48:14 +0100
-Subject: [PATCH 18/26] [aml] Fix stuttering during a playback of a video with
- 23.976 FPS
+Subject: [PATCH] [aml] Fix stuttering during a playback of a video with 23.976
+ FPS
 
 ---
- xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp |    2 +-
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-index bf4c958..8d49de8 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/AMLCodec.cpp
-@@ -1932,7 +1932,7 @@ void CAMLCodec::Process()
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
+index 2103042..edf8f8a 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/AMLCodec.cpp
+@@ -1974,7 +1974,7 @@ void CAMLCodec::Process()
  
          double error = app_pts - (double)pts_video/PTS_FREQ;
          double abs_error = fabs(error);
@@ -21,6 +21,3 @@ index bf4c958..8d49de8 100644
          {
            //CLog::Log(LOGDEBUG, "CAMLCodec::Process pts diff = %f", error);
            if (abs_error > 0.150)
--- 
-1.7.10.4
-
diff --git a/projects/WeTek_Play/patches/kodi/0019-aml-Ugly-workaround-to-show-DTS-AC3-caps.patch b/projects/WeTek_Play/patches/kodi/0019-aml-Ugly-workaround-to-show-DTS-AC3-caps.patch
new file mode 100644
index 0000000000..6d8d4a23b5
--- /dev/null
+++ b/projects/WeTek_Play/patches/kodi/0019-aml-Ugly-workaround-to-show-DTS-AC3-caps.patch
@@ -0,0 +1,27 @@
+From 11435d47a5351c55ef12661baf1adb167da513ef Mon Sep 17 00:00:00 2001
+From: Alex Deryskyba <alex@codesnake.com>
+Date: Tue, 30 Jun 2015 11:19:57 +0200
+Subject: [PATCH] [aml] Ugly workaround to show DTS/AC3 caps
+
+... but don't run into multi channel issues as we can only open 2 pcm channels
+---
+ xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
+index 32ab888..d4eb0be 100644
+--- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
++++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp
+@@ -1336,6 +1336,12 @@ void CAESinkALSA::EnumerateDevicesEx(AEDeviceInfoList &list, bool force)
+ 
+ AEDeviceType CAESinkALSA::AEDeviceTypeFromName(const std::string &name)
+ {
++#ifdef HAS_LIBAMCODEC
++  // ugly workaround to show DTS / AC3 caps
++  // but don't run into multi channel issues
++  // as we can only open 2 pcm channels
++  return AE_DEVTYPE_IEC958;
++#endif
+   if (name.substr(0, 4) == "hdmi")
+     return AE_DEVTYPE_HDMI;
+   else if (name.substr(0, 6) == "iec958" || name.substr(0, 5) == "spdif")
diff --git a/projects/WeTek_Play/patches/kodi/0020-aml-Add-support-for-4k-resolutions.patch b/projects/WeTek_Play/patches/kodi/0020-aml-Add-support-for-4k-resolutions.patch
new file mode 100644
index 0000000000..065c10f81a
--- /dev/null
+++ b/projects/WeTek_Play/patches/kodi/0020-aml-Add-support-for-4k-resolutions.patch
@@ -0,0 +1,214 @@
+From c90b4c2ced393524e81c0ec321afa7ec94f3b806 Mon Sep 17 00:00:00 2001
+From: Alex Deryskyba <alex@codesnake.com>
+Date: Wed, 1 Jul 2015 23:37:11 +0200
+Subject: [PATCH] [aml] Add support for 4k resolutions
+
+---
+ xbmc/utils/AMLUtils.cpp                     | 16 +++---
+ xbmc/windowing/egl/EGLNativeTypeAmlogic.cpp | 81 ++++++++++++++++++++++-------
+ xbmc/windowing/egl/EGLNativeTypeAmlogic.h   |  2 +
+ 3 files changed, 73 insertions(+), 26 deletions(-)
+
+diff --git a/xbmc/utils/AMLUtils.cpp b/xbmc/utils/AMLUtils.cpp
+index 1b54435..05f2cd0 100644
+--- a/xbmc/utils/AMLUtils.cpp
++++ b/xbmc/utils/AMLUtils.cpp
+@@ -449,8 +449,8 @@ bool aml_mode_to_resolution(const char *mode, RESOLUTION_INFO *res)
+   }
+   else if (StringUtils::EqualsNoCase(fromMode, "4k2ksmpte") || StringUtils::EqualsNoCase(fromMode, "smpte24hz"))
+   {
+-    res->iWidth = 1920;
+-    res->iHeight= 1080;
++    res->iWidth = 4096;
++    res->iHeight= 2160;
+     res->iScreenWidth = 4096;
+     res->iScreenHeight= 2160;
+     res->fRefreshRate = 24;
+@@ -467,8 +467,8 @@ bool aml_mode_to_resolution(const char *mode, RESOLUTION_INFO *res)
+   }
+   else if (StringUtils::EqualsNoCase(fromMode, "4k2k24hz") || StringUtils::EqualsNoCase(fromMode, "2160p24hz"))
+   {
+-    res->iWidth = 1920;
+-    res->iHeight= 1080;
++    res->iWidth = 3840;
++    res->iHeight= 2160;
+     res->iScreenWidth = 3840;
+     res->iScreenHeight= 2160;
+     res->fRefreshRate = 24;
+@@ -476,8 +476,8 @@ bool aml_mode_to_resolution(const char *mode, RESOLUTION_INFO *res)
+   }
+   else if (StringUtils::EqualsNoCase(fromMode, "4k2k25hz") || StringUtils::EqualsNoCase(fromMode, "2160p25hz"))
+   {
+-    res->iWidth = 1920;
+-    res->iHeight= 1080;
++    res->iWidth = 3840;
++    res->iHeight= 2160;
+     res->iScreenWidth = 3840;
+     res->iScreenHeight= 2160;
+     res->fRefreshRate = 25;
+@@ -494,8 +494,8 @@ bool aml_mode_to_resolution(const char *mode, RESOLUTION_INFO *res)
+   }
+   else if (StringUtils::EqualsNoCase(fromMode, "4k2k30hz") || StringUtils::EqualsNoCase(fromMode, "2160p30hz"))
+   {
+-    res->iWidth = 1920;
+-    res->iHeight= 1080;
++    res->iWidth = 3840;
++    res->iHeight= 2160;
+     res->iScreenWidth = 3840;
+     res->iScreenHeight= 2160;
+     res->fRefreshRate = 30;
+diff --git a/xbmc/windowing/egl/EGLNativeTypeAmlogic.cpp b/xbmc/windowing/egl/EGLNativeTypeAmlogic.cpp
+index 21256ef..2103f0b 100644
+--- a/xbmc/windowing/egl/EGLNativeTypeAmlogic.cpp
++++ b/xbmc/windowing/egl/EGLNativeTypeAmlogic.cpp
+@@ -65,7 +65,22 @@ void CEGLNativeTypeAmlogic::Initialize()
+ {
+   aml_permissions();
+   DisableFreeScale();
++  GetMaxResolution(m_maxResolution);
+ }
++
++void CEGLNativeTypeAmlogic::GetMaxResolution(RESOLUTION_INFO &maxResolution)
++{
++  std::vector<RESOLUTION_INFO> resolutions;
++  ProbeResolutions(resolutions);
++
++  maxResolution = {0};
++  for (size_t i = 0; i < resolutions.size(); i++)
++  {
++    if (resolutions[i].iScreenWidth > maxResolution.iScreenWidth || resolutions[i].iScreenHeight > maxResolution.iScreenHeight)
++      maxResolution = resolutions[i];
++  }
++}
++
+ void CEGLNativeTypeAmlogic::Destroy()
+ {
+   return;
+@@ -84,8 +99,8 @@ bool CEGLNativeTypeAmlogic::CreateNativeWindow()
+   if (!nativeWindow)
+     return false;
+ 
+-  nativeWindow->width = 1920;
+-  nativeWindow->height = 1080;
++  nativeWindow->width = m_maxResolution.iScreenWidth;
++  nativeWindow->height = m_maxResolution.iScreenHeight;
+   m_nativeWindow = nativeWindow;
+ 
+   SetFramebufferResolution(nativeWindow->width, nativeWindow->height);
+@@ -142,48 +157,78 @@ bool CEGLNativeTypeAmlogic::SetNativeResolution(const RESOLUTION_INFO &res)
+   }
+ #endif
+ 
+-  switch((int)(0.5 + res.fRefreshRate))
++  switch((int)(res.fRefreshRate*10))
+   {
+     default:
+-    case 60:
++    case 600:
+       switch(res.iScreenWidth)
+       {
+         default:
+         case 1280:
+-          SetDisplayResolution("720p");
++          return SetDisplayResolution("720p");
+           break;
+         case 1920:
+           if (res.dwFlags & D3DPRESENTFLAG_INTERLACED)
+-            SetDisplayResolution("1080i");
++            return SetDisplayResolution("1080i");
+           else
+-            SetDisplayResolution("1080p");
++            return SetDisplayResolution("1080p");
+           break;
+       }
+       break;
+-    case 50:
++    case 500:
+       switch(res.iScreenWidth)
+       {
+         default:
+         case 1280:
+-          SetDisplayResolution("720p50hz");
++          return SetDisplayResolution("720p50hz");
+           break;
+         case 1920:
+           if (res.dwFlags & D3DPRESENTFLAG_INTERLACED)
+-            SetDisplayResolution("1080i50hz");
++            return SetDisplayResolution("1080i50hz");
+           else
+-            SetDisplayResolution("1080p50hz");
++            return SetDisplayResolution("1080p50hz");
+           break;
+       }
+       break;
+-    case 30:
+-      SetDisplayResolution("1080p30hz");
++    case 300:
++      switch(res.iScreenWidth)
++      {
++        case 3840:
++          return SetDisplayResolution("4k2k30hz");
++          break;
++        default:
++          return SetDisplayResolution("1080p30hz");
++          break;
++      }
+       break;
+-    case 24:
+-      SetDisplayResolution("1080p24hz");
++    case 250:
++      switch(res.iScreenWidth)
++      {
++        case 3840:
++          return SetDisplayResolution("4k2k25hz");
++          break;
++        default:
++          return SetDisplayResolution("1080p25hz");
++          break;
++      }
++      break;
++    case 240:
++      switch(res.iScreenWidth)
++      {
++        case 3840:
++          return SetDisplayResolution("4k2k24hz");
++          break;
++        case 4096:
++          return SetDisplayResolution("4k2ksmpte");
++          break;
++        default:
++          return SetDisplayResolution("1080p24hz");
++          break;
++      }
+       break;
+   }
+ 
+-  return true;
++  return false;
+ }
+ 
+ bool CEGLNativeTypeAmlogic::ProbeResolutions(std::vector<RESOLUTION_INFO> &resolutions)
+@@ -280,8 +325,8 @@ void CEGLNativeTypeAmlogic::SetFramebufferResolution(int width, int height) cons
+     {
+       vinfo.xres = width;
+       vinfo.yres = height;
+-      vinfo.xres_virtual = 1920;
+-      vinfo.yres_virtual = 2160;
++      vinfo.xres_virtual = m_maxResolution.iScreenWidth;
++      vinfo.yres_virtual = m_maxResolution.iScreenHeight * 2;
+       vinfo.bits_per_pixel = 32;
+       vinfo.activate = FB_ACTIVATE_ALL;
+       ioctl(fd0, FBIOPUT_VSCREENINFO, &vinfo);
+diff --git a/xbmc/windowing/egl/EGLNativeTypeAmlogic.h b/xbmc/windowing/egl/EGLNativeTypeAmlogic.h
+index 6867c38..9ca41d4 100644
+--- a/xbmc/windowing/egl/EGLNativeTypeAmlogic.h
++++ b/xbmc/windowing/egl/EGLNativeTypeAmlogic.h
+@@ -55,6 +55,8 @@ class CEGLNativeTypeAmlogic : public CEGLNativeType
+ private:
+   void SetFramebufferResolution(const RESOLUTION_INFO &res) const;
+   void SetFramebufferResolution(int width, int height) const;
++  void GetMaxResolution(RESOLUTION_INFO &maxResolution);
+ 
+   std::string m_framebuffer_name;
++  RESOLUTION_INFO m_maxResolution;
+ };
diff --git a/projects/WeTek_Play/patches/kodi/0103-fix_compiler_badness_when_compiling_with_amcodec.patch b/projects/WeTek_Play/patches/kodi/0103-fix_compiler_badness_when_compiling_with_amcodec.patch
index cb2351b790..9a4becf019 100644
--- a/projects/WeTek_Play/patches/kodi/0103-fix_compiler_badness_when_compiling_with_amcodec.patch
+++ b/projects/WeTek_Play/patches/kodi/0103-fix_compiler_badness_when_compiling_with_amcodec.patch
@@ -1,8 +1,17 @@
-diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/Makefile.in b/xbmc/cores/dvdplayer/DVDCodecs/Video/Makefile.in
-index 8a97889..78506b1 100644
---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/Makefile.in
-+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/Makefile.in
-@@ -27,8 +27,6 @@ endif
+From 42e367deff4886e825dc77606c76ee9835d6ee9e Mon Sep 17 00:00:00 2001
+From: Stefan Saraev <stefan@saraev.ca>
+Date: Sat, 7 Nov 2015 16:24:58 +0200
+Subject: [PATCH] [wetek] fix compiler badness when compiling with amcodec
+
+---
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/Makefile.in | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/Makefile.in b/xbmc/cores/VideoPlayer/DVDCodecs/Video/Makefile.in
+index 7880c06..bd083b7 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/Makefile.in
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/Makefile.in
+@@ -25,8 +25,6 @@ endif
  ifeq (@USE_LIBAMCODEC@,1)
  SRCS += AMLCodec.cpp
  SRCS += DVDVideoCodecAmlogic.cpp
diff --git a/projects/WeTek_Play/patches/kodi/0105-perform_suspend_instead_of_powerdown.patch b/projects/WeTek_Play/patches/kodi/0105-perform_suspend_instead_of_powerdown.patch
index 07a504fe3d..d66d9f07a0 100644
--- a/projects/WeTek_Play/patches/kodi/0105-perform_suspend_instead_of_powerdown.patch
+++ b/projects/WeTek_Play/patches/kodi/0105-perform_suspend_instead_of_powerdown.patch
@@ -1,8 +1,18 @@
+From 55b3da7e97f6dc1e99724782253c35fe4c8b7c5a Mon Sep 17 00:00:00 2001
+From: Stefan Saraev <stefan@saraev.ca>
+Date: Sat, 7 Nov 2015 16:25:44 +0200
+Subject: [PATCH] [wetek] perform suspend instead of powerdown
+
+---
+ system/keymaps/keyboard.xml                        | 2 +-
+ xbmc/powermanagement/linux/LogindUPowerSyscall.cpp | 4 ++--
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
 diff --git a/system/keymaps/keyboard.xml b/system/keymaps/keyboard.xml
-index 45682a2..b8ce91b 100644
+index 9833cff..0ce105e 100644
 --- a/system/keymaps/keyboard.xml
 +++ b/system/keymaps/keyboard.xml
-@@ -96,7 +96,7 @@
+@@ -91,7 +91,7 @@
        <backslash>ToggleFullScreen</backslash>
        <home>FirstPage</home>
        <end>LastPage</end>
@@ -10,12 +20,12 @@ index 45682a2..b8ce91b 100644
 +      <power>XBMC.Powerdown()</power>
        <sleep>ActivateWindow(shutdownmenu)</sleep>
        <!-- PVR windows -->
-       <e>XBMC.ActivateWindowAndFocus(MyPVR, 31,0, 10,0)</e>
+       <e>ActivateWindow(TVGuide)</e>
 diff --git a/xbmc/powermanagement/linux/LogindUPowerSyscall.cpp b/xbmc/powermanagement/linux/LogindUPowerSyscall.cpp
-index 5a97fe6..369e790 100644
+index 4e5bcc6..ad5847d 100644
 --- a/xbmc/powermanagement/linux/LogindUPowerSyscall.cpp
 +++ b/xbmc/powermanagement/linux/LogindUPowerSyscall.cpp
-@@ -52,7 +52,7 @@ CLogindUPowerSyscall::CLogindUPowerSyscall()
+@@ -53,7 +53,7 @@ CLogindUPowerSyscall::CLogindUPowerSyscall()
    m_canPowerdown = LogindCheckCapability("CanPowerOff");
    m_canReboot    = LogindCheckCapability("CanReboot");
    m_canHibernate = LogindCheckCapability("CanHibernate");
@@ -24,7 +34,7 @@ index 5a97fe6..369e790 100644
  
    InhibitDelayLock();
  
-@@ -97,7 +97,7 @@ CLogindUPowerSyscall::~CLogindUPowerSyscall()
+@@ -98,7 +98,7 @@ CLogindUPowerSyscall::~CLogindUPowerSyscall()
  
  bool CLogindUPowerSyscall::Powerdown()
  {
diff --git a/projects/WeTek_Play/patches/kodi/1012-hide-meaningless-skips-from-users.patch b/projects/WeTek_Play/patches/kodi/1012-hide-meaningless-skips-from-users.patch
index cb240a4801..6a82519bec 100644
--- a/projects/WeTek_Play/patches/kodi/1012-hide-meaningless-skips-from-users.patch
+++ b/projects/WeTek_Play/patches/kodi/1012-hide-meaningless-skips-from-users.patch
@@ -1,22 +1,22 @@
-From a19578679f63520b0d8e9f589b82699076aff6ed Mon Sep 17 00:00:00 2001
-From: "Chris \"Koying\" Browet" <cbro@semperpax.com>
+From 0cdf5c043e783e8d956c5d59be8a9ec13a53fee3 Mon Sep 17 00:00:00 2001
+From: "Chris \\\"Koying\\\" Browet" <cbro@semperpax.com>
 Date: Fri, 19 Dec 2014 12:30:04 +0100
-Subject: [PATCH] FIX: [renderer] hide meaningless skips from users
+Subject: [PATCH] [aml] FIX: [renderer] hide meaningless skips from users
 
 ---
- xbmc/cores/VideoRenderers/RenderManager.cpp | 3 ++-
+ xbmc/cores/VideoPlayer/VideoRenderers/RenderManager.cpp | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)
 
-diff --git a/xbmc/cores/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoRenderers/RenderManager.cpp
-index 7f43949..01327f5 100644
---- a/xbmc/cores/VideoRenderers/RenderManager.cpp
-+++ b/xbmc/cores/VideoRenderers/RenderManager.cpp
-@@ -1198,7 +1198,8 @@ void CXBMCRenderManager::PrepareNextRender()
+diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/RenderManager.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/RenderManager.cpp
+index 70741be..15a484a 100644
+--- a/xbmc/cores/VideoPlayer/VideoRenderers/RenderManager.cpp
++++ b/xbmc/cores/VideoPlayer/VideoRenderers/RenderManager.cpp
+@@ -1421,7 +1421,8 @@ void CRenderManager::PrepareNextRender()
      while(m_queued.front() != idx)
      {
        requeue(m_discard, m_queued);
 -      m_QueueSkip++;
-+      if (m_format != RENDER_FMT_BYPASS)  // skips scares users ;)
++      if (m_format != RENDER_FMT_AML)  // skips scares users ;)
 +        m_QueueSkip++;
      }
  
diff --git a/tools/mkpkg/mkpkg_kodi b/tools/mkpkg/mkpkg_kodi
index 6ffdf8fcf2..ac03dc95db 100755
--- a/tools/mkpkg/mkpkg_kodi
+++ b/tools/mkpkg/mkpkg_kodi
@@ -58,13 +58,13 @@ echo "cleaning sources..."
   rm -rf $PKG_NAME-$PKG_VERSION/.git
 
 echo "seperating theme..."
-  rm -rf $PKG_NAME-theme-Confluence-$PKG_VERSION
-  mv $PKG_NAME-$PKG_VERSION/addons/skin.confluence $PKG_NAME-theme-Confluence-$PKG_VERSION
+  rm -rf $PKG_NAME-theme-Estuary-$PKG_VERSION
+  mv $PKG_NAME-$PKG_VERSION/addons/skin.estuary $PKG_NAME-theme-Estuary-$PKG_VERSION
 
 echo "packing sources..."
   tar cvJf $PKG_NAME-$PKG_VERSION.tar.xz $PKG_NAME-$PKG_VERSION
-  tar cvJf $PKG_NAME-theme-Confluence-$PKG_VERSION.tar.xz $PKG_NAME-theme-Confluence-$PKG_VERSION
+  tar cvJf $PKG_NAME-theme-Estuary-$PKG_VERSION.tar.xz $PKG_NAME-theme-Estuary-$PKG_VERSION
 
 echo "remove temporary sourcedir..."
   rm -rf $PKG_NAME-$PKG_VERSION
-  rm -rf $PKG_NAME-theme-Confluence-$PKG_VERSION
+  rm -rf $PKG_NAME-theme-Estuary-$PKG_VERSION