diff --git a/projects/RPi/patches/kodi/kodi-001-backport.patch b/projects/RPi/patches/kodi/kodi-001-backport.patch
index 08f5de4bc2..6a250524a2 100644
--- a/projects/RPi/patches/kodi/kodi-001-backport.patch
+++ b/projects/RPi/patches/kodi/kodi-001-backport.patch
@@ -1,7 +1,91 @@
-From 2ba9dbed84a444bc39a9d83d963e518239a2d8ec Mon Sep 17 00:00:00 2001
+From 6cebd3b7186d58ee1dd14263f532f9a8c6f005bd Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Tue, 28 Oct 2014 00:19:40 +0000
+Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats
+
+---
+ addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
+ system/peripherals.xml                              |  4 +++-
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
+ 3 files changed, 34 insertions(+), 1 deletion(-)
+
+diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
+index e0060d1fae556de529274dbc6be07455701573a3..6443f3dd885bf0aa8e031039e36e273972a310ae 100644
+--- a/addons/resource.language.en_gb/resources/strings.po
++++ b/addons/resource.language.en_gb/resources/strings.po
+@@ -19745,3 +19745,18 @@ msgstr ""
+ msgctxt "#39010"
+ msgid "Select sort method"
+ msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38050"
++msgid "Remote button press delay before repeating (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38051"
++msgid "Remote button press repeat rate (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38052"
++msgid "Remote button press release time (ms)"
++msgstr ""
+diff --git a/system/peripherals.xml b/system/peripherals.xml
+index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df0621f9e041 100644
+--- a/system/peripherals.xml
++++ b/system/peripherals.xml
+@@ -31,7 +31,9 @@
+     <setting key="device_type" type="int" value="1" configurable="0" />
+     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
+     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
+-    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
++    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
++    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
++    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
+   </peripheral>
+ 
+   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
+   m_configuration.bActivateSource = config.bActivateSource;
+   bChanged |= SetSetting("activate_source", m_configuration.bActivateSource == 1);
+ 
++#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
++  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
++  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
++#else
++  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
++  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs);
++#endif
++
++  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
++  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
++
++  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
++  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
++
+   m_configuration.bPowerOffOnStandby = config.bPowerOffOnStandby;
+ 
+   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
+@@ -1398,6 +1412,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
+   // backwards compatibility. will be removed once the next major release of libCEC is out
+   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
+ #endif
++  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
++  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
+ 
+   if (GetSettingBool("pause_playback_on_deactivate"))
+   {
+
+From 0fdeeb63794764ebdd628e52d170bf8bac330efd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Apr 2014 17:27:52 +0100
-Subject: [PATCH 02/71] [cec] Don't suspend pi on tv switch off - it can't wake
+Subject: [PATCH 02/75] [cec] Don't suspend pi on tv switch off - it can't wake
  up
 
 ---
@@ -22,10 +106,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325
      <setting key="use_tv_menu_language" type="bool" value="1" label="36018" order="10" />
      <setting key="pause_playback_on_deactivate" type="bool" value="1" label="36033" configurable="0" />
 
-From 936c12492b75b00bc991b1fbc0bfc740a099206c Mon Sep 17 00:00:00 2001
+From 36f4544b7ac9c810c875e8ae19ab92b3f3dafb59 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 03/71] [rbp/omxplayer] When opening a stream don't try to
+Subject: [PATCH 03/75] [rbp/omxplayer] When opening a stream don't try to
  update gui so often
 
 ---
@@ -49,10 +133,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f
          dialog->ProcessRenderLoop(false);
          if (allowCancel && dialog->IsCanceled())
 
-From d557ef01432ab1b17a41ecf339259c4c2a95a58e Mon Sep 17 00:00:00 2001
+From 2be0471046b5e75078f1a284348b3d2fbd033555 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 04/71] [hifiberry] Hack: force it to be recognised as IEC958
+Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958
  capable to enable passthrough options
 
 ---
@@ -75,10 +159,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5
          info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
      {
 
-From 243a6a522a997e5502dfc644415352bab635e26e Mon Sep 17 00:00:00 2001
+From e2b718b239b65f2132406355dfdf9c66da744b9c Mon Sep 17 00:00:00 2001
 From: Ben Avison <bavison@riscosopen.org>
 Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 05/71] Improved file buffering in CArchive
+Subject: [PATCH 05/75] Improved file buffering in CArchive
 
 Even though memcpy is typically inlined by the compiler into byte/word loads
 and stores (at least for release builds), the frequency with which 1, 2 and 4
@@ -138,10 +222,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a
      }
  
 
-From b5f43e1c7e25eb7ddfc72def17fa6ce252febf57 Mon Sep 17 00:00:00 2001
+From e59492cefc6ebc66027e7fb96475f14ad14a650c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 06/71] filesystem: Make support of browsing into archives
+Subject: [PATCH 06/75] filesystem: Make support of browsing into archives
  optional
 
 The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
@@ -251,10 +335,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7
    {
      CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
 
-From 199df29247e2fa52ad74270db9496bb816e955aa Mon Sep 17 00:00:00 2001
+From 73698542aed16c452fc15f5cd5a438e127676b68 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 07/71] [rbp] Make cachemembuffersize default depend on memory
+Subject: [PATCH 07/75] [rbp] Make cachemembuffersize default depend on memory
  size
 
 ---
@@ -356,10 +440,10 @@ index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176
  }
  
 
-From 60d038a72dc2787e87e78241da4293c40c6e8be0 Mon Sep 17 00:00:00 2001
+From 48eb57a16b9d386dc54b42ab04700f8f7f85fab9 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 08/71] [settings] Experiment: Report DESKTOP resolution in
+Subject: [PATCH 08/75] [settings] Experiment: Report DESKTOP resolution in
  video settings
 
 ---
@@ -381,10 +465,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9
          StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
                              ModeFlagsToString(resolution->flags, false).c_str()),
 
-From 339e21959b9c8cb48571e1cc17d14c83240043ab Mon Sep 17 00:00:00 2001
+From 952474c036385667d8ec894c178f58490af6f69c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 09/71] [audio] Add settings option to boost centre channel
+Subject: [PATCH 09/75] [audio] Add settings option to boost centre channel
  when downmixing
 
 This allows a dB volume increase to be added to centre channel.
@@ -510,10 +594,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5
      // stereo upmix
      if (upmix && m_src_channels == 2 && m_dst_channels > 2)
 
-From ab24bce0380f80ad67a895d0fb76b65915712a1a Mon Sep 17 00:00:00 2001
+From 1296ca8ae16f160bd8bdf00491582f94577122c5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 10/71] [rbp] Default extract thumbnails to false
+Subject: [PATCH 10/75] [rbp] Default extract thumbnails to false
 
 It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
 It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
@@ -539,10 +623,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc
      </category>
    </section>
 
-From 1f2afb5a24ad283a0113d138efaafa05a8c983c3 Mon Sep 17 00:00:00 2001
+From 221907efb819c990488518eb9c4b7cfd91151e4e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 11/71] [languageinvoker] Reduce priority of python threads
+Subject: [PATCH 11/75] [languageinvoker] Reduce priority of python threads
 
 ---
  xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
@@ -565,10 +649,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1
  }
  
 
-From 275bea7284b6f326aff83305d9614ca8963d745b Mon Sep 17 00:00:00 2001
+From cf222655784da191a022a153fa5614cfbb4d79bd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 12/71] [rbp] hack: wait for splash to complete before changing
+Subject: [PATCH 12/75] [rbp] hack: wait for splash to complete before changing
  hdmi mode
 
 ---
@@ -652,10 +736,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928
  
    RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
 
-From 4b9d00125907996bb8db50765d6019aecc06d494 Mon Sep 17 00:00:00 2001
+From 7c77d589e065637bb0644889b520f3902b44b880 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 13/71] Fix for UI not showing both extractflags and
+Subject: [PATCH 13/75] Fix for UI not showing both extractflags and
  extractthumb
 
 ---
@@ -718,10 +802,10 @@ index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b2
            <control type="toggle" />
          </setting>
 
-From 289cdf24dcdd94f577248862ca438f672302b936 Mon Sep 17 00:00:00 2001
+From 9e7d22b484cbccf5d54293a36c3cae38ce7426dd Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 14/71] Disable autoscrolling while on screensaver and while
+Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while
  opening streams.
 
 ---
@@ -734,10 +818,10 @@ Subject: [PATCH 14/71] Disable autoscrolling while on screensaver and while
  6 files changed, 24 insertions(+), 3 deletions(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index b8ff91b427c4fd430675aab3d1d93098c976031f..fdf7b1dc04e31ffe8e1d1b83825343b24c645b02 100644
+index 947f0937d73cde5e4a4f39ed1a7932bd1e8eb0fe..593acafd15bb0409b4446b6e598f7aa4d7baf434 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -5229,3 +5229,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
+@@ -5232,3 +5232,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
    
    return false;
  }
@@ -852,10 +936,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144
        if (m_lastRenderTime)
          m_autoScrollDelayTime += currentTime - m_lastRenderTime;
 
-From 3e7e8ad0f181636081a5844c4cbc81c4db8e2c64 Mon Sep 17 00:00:00 2001
+From 831794fa04a8589069317953f813ada9f0d3bf54 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 15/71] [demuxer] Avoid memcpy on every demuxer packet
+Subject: [PATCH 15/75] [demuxer] Avoid memcpy on every demuxer packet
 
 Avoids an unnecessary memcpy on every demuxer packet which for
 high bitrate videos can be significant.
@@ -955,10 +1039,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96
      }
      catch(...) {
 
-From 5d25db06c31809475a48876f9de7e0875fdaf1f4 Mon Sep 17 00:00:00 2001
+From 9673bb4533c0a82f4712752b6f6d28f5f1ceb24e Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 16/71] Load OSD dialogs on startup.
+Subject: [PATCH 16/75] Load OSD dialogs on startup.
 
 Fixes skipped frames the first time they're loaded in memory on less powered
 devices, like a Raspberry Pi, when using DVDPlayer.
@@ -1053,10 +1137,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e
  CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
  { }
 
-From a7074f84d7adec4ec29679bb9ecf84d82f3db69c Mon Sep 17 00:00:00 2001
+From 19b2018244c328f5f88f90271e31de66bea486e3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 17/71] [gui] Also limit GUI updates when in non full-screen
+Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen
  video mode
 
 ---
@@ -1064,7 +1148,7 @@ Subject: [PATCH 17/71] [gui] Also limit GUI updates when in non full-screen
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61752519ef 100644
+index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea61b2ddcc 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
 @@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
@@ -1086,10 +1170,10 @@ index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61
      g_windowManager.FrameMove();
    }
 
-From ce41575e8ece9bef487b338b838520407e59ee8d Mon Sep 17 00:00:00 2001
+From b7e74e740581f7e6ab94609171000b747da9c911 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 18/71] [screensaver] Leave GUI contents available for
+Subject: [PATCH 18/75] [screensaver] Leave GUI contents available for
  screensaver
 
 ---
@@ -1119,10 +1203,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac
  
    // Add window to the history list (we must do this before we activate it,
 
-From 27a497a156eeb9fd7f589f3b121ea5fde3c4d47a Mon Sep 17 00:00:00 2001
+From fe4cef6b6e2a35352ede135ac84ff3539d1ff09e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 19/71] ffmpeg: Automatic switch to software decode for GMC
+Subject: [PATCH 19/75] ffmpeg: Automatic switch to software decode for GMC
  with more than one warp point
 
 ---
@@ -1350,10 +1434,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4
        else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
          supported = true;
 
-From e7c0b0639f3216a4ec732453eb3869221035c0c8 Mon Sep 17 00:00:00 2001
+From f5dabe10623f19cd9e8ea015e2d248d47c03900c Mon Sep 17 00:00:00 2001
 From: Claudio-Sjo <Claudio.Porfiri@gmail.com>
 Date: Mon, 16 Feb 2015 14:51:26 +0100
-Subject: [PATCH 20/71] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
+Subject: [PATCH 20/75] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
  - fixes #15794
 
 ---
@@ -1545,10 +1629,10 @@ index 0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3
    lsn_t m_lsnCurrent; // Position inside the track in logical sector number
    lsn_t m_lsnEnd;   // End of m_iTrack in logical sector number
 
-From ccee8a7820c164e8eef572e2c4940f407992daad Mon Sep 17 00:00:00 2001
+From 9e3b4fd8c161b01d324220252289a5b3a49fb7e8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 24 Jun 2016 19:38:13 +0100
-Subject: [PATCH 21/71] codecoverlay: Include codec name in overlay
+Subject: [PATCH 21/75] codecoverlay: Include codec name in overlay
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++
@@ -1642,10 +1726,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291
    double                    m_iSubtitleDelay;
    bool                      m_bRenderSubs;
 
-From 97a606afbd7b6502cc23b64568df6e93de332bdf Mon Sep 17 00:00:00 2001
+From 119f7291d3b7c1a57d3a86b3836c8a73a7cd1211 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 8 Mar 2016 21:20:58 +0300
-Subject: [PATCH 22/71] [DebugInfo] Add cpu usage info.
+Subject: [PATCH 22/75] [DebugInfo] Add cpu usage info.
 
 ---
  .../VideoPlayer/VideoRenderers/DebugRenderer.cpp   | 56 ++++++++--------------
@@ -1815,10 +1899,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef
  
        m_debugTimer.Set(1000);
 
-From 65148f02888c16e3ae05b0639ddd6a753ae261fa Mon Sep 17 00:00:00 2001
+From 21927619971ef137030d64a0dd102a90a7effaf0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 23/71] ffmpeg: Allow neon to be enabled in unified builds
+Subject: [PATCH 23/75] ffmpeg: Allow neon to be enabled in unified builds
 
 ---
  tools/depends/target/ffmpeg/Makefile | 4 ++++
@@ -1841,10 +1925,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153
  ifeq ($(OS), linux)
    ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
 
-From 7b3ac50fec10531c959fa94e92e3a2c6be0b8789 Mon Sep 17 00:00:00 2001
+From 7c9767ac163fada0423cf8cc27b05f0d74482220 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 24/71] ffmpeg: Add some upstream HEVC optimisations
+Subject: [PATCH 24/75] ffmpeg: Add some upstream HEVC optimisations
 
 ---
  tools/depends/target/ffmpeg/Makefile               |    6 +-
@@ -5642,10 +5726,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81
 +2.5.0
 +
 
-From 3e411c507b6f607fff9b05dc6a2f041d2a8ef986 Mon Sep 17 00:00:00 2001
+From f15eaf9000104c97d5bfc5ea046b4407cab2a261 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 25/71] [ffmpeg] Add GPU acceleration to hevc
+Subject: [PATCH 25/75] [ffmpeg] Add GPU acceleration to hevc
 
 ---
  tools/depends/target/ffmpeg/Makefile               |     4 +-
@@ -43831,10 +43915,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71
 +2.7.4
 +
 
-From 7d7851a6a6201afea2c705d6ab30494a48006f2d Mon Sep 17 00:00:00 2001
+From 88b331888a7677058bb3dfb064d7eb952b0ce1a9 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 26/71] ffmpeg: Add cabac opimisations for hevc
+Subject: [PATCH 26/75] ffmpeg: Add cabac opimisations for hevc
 
 ---
  .../0001-Squashed-commit-of-the-following.patch    | 2179 ++++++++++++++++++++
@@ -46079,10 +46163,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b
  
  make -j ${BUILDTHREADS} 
 
-From de9212a260d82d7ce6584bf11adde8aa7b9035e9 Mon Sep 17 00:00:00 2001
+From ce532b19d18df015cecb0e2e2ec85f0c89885a25 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 27/71] [3d] Make MVC a valid 3D filename tag
+Subject: [PATCH 27/75] [3d] Make MVC a valid 3D filename tag
 
 ---
  xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
@@ -46143,10 +46227,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef
      bool m_useDisplayControlHWStereo;
  
 
-From 438d9a918515eba692999e310cecf2816bd68b8d Mon Sep 17 00:00:00 2001
+From df4fc81637ca4b47d4ce0e64110d8bab4bd77cd4 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 5 Oct 2015 14:58:05 +0100
-Subject: [PATCH 28/71] [3d] Swap top/bottom sides of GUI
+Subject: [PATCH 28/75] [3d] Swap top/bottom sides of GUI
 
 ---
  xbmc/guilib/GraphicContext.cpp | 2 +-
@@ -46166,10 +46250,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d
    }
    if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
 
-From 5b43a704acfb943c6010d109418779da3a7febda Mon Sep 17 00:00:00 2001
+From 2373df61c862bc62538391596c098a80968d1c0d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 11 Oct 2015 20:51:37 +0100
-Subject: [PATCH 29/71] Revert "Revert "Disable extra logging by default""
+Subject: [PATCH 29/75] Revert "Revert "Disable extra logging by default""
 
 This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
 ---
@@ -46196,10 +46280,10 @@ index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2
              <options>loggingcomponents</options>
              <delimiter>,</delimiter>
 
-From ba0799b8adc0eba075b0b90c86f6670398b65f45 Mon Sep 17 00:00:00 2001
+From a0543043a26699a0e4a8bed989481ab1320e3f0c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 30/71] [omximage] Fall back to arm jpeg encode/decode when gpu
+Subject: [PATCH 30/75] [omximage] Fall back to arm jpeg encode/decode when gpu
  is busy
 
 ---
@@ -46442,10 +46526,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218
  
  extern COMXImage g_OMXImage;
 
-From 8d5f8aa788c54c4cf0d1f448d322217b94d7eb29 Mon Sep 17 00:00:00 2001
+From 72ad7c69c3f847ade231f29ac23ffb96ebaf2ae4 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 31/71] [mmalcodec] Fail to open when width is invalid. Can
+Subject: [PATCH 31/75] [mmalcodec] Fail to open when width is invalid. Can
  happen with mpegts files
 
 ---
@@ -46467,10 +46551,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568
    if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
      return false;
 
-From 6b2cc20d5a1733f1bd97b46bf938b9b57904ac2c Mon Sep 17 00:00:00 2001
+From 0e735b38e2891c582c5a37dc5ded26cb954948a8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 32/71] [videoplayer/rbp] Add pi specific option to maintain
+Subject: [PATCH 32/75] [videoplayer/rbp] Add pi specific option to maintain
  vsync with pll adjustment
 
 New A/V sync option in settings/video/playback to do "Adjust PLL".
@@ -46911,10 +46995,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50
  
  extern CRBP g_RBP;
 
-From 57021f87ad5adaa6d559a5a59e4f07469289f578 Mon Sep 17 00:00:00 2001
+From d4a5c46043ced09c53dea24e6ca090a574806e3b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 33/71] rbp: Support zero copy interface with hevc acceleration
+Subject: [PATCH 33/75] rbp: Support zero copy interface with hevc acceleration
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++
@@ -46958,10 +47042,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803
    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
      CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem);
 
-From b193395bcc84c1954a89811d565bfac787e6315a Mon Sep 17 00:00:00 2001
+From 0fbf365c6de020f0d094c8ab221b159593eecce5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 16 May 2015 18:26:04 +0100
-Subject: [PATCH 34/71] ffmpeg: use upstream mvc patches
+Subject: [PATCH 34/75] ffmpeg: use upstream mvc patches
 
 ---
  ...vcodec-add-h264_mvc-codec-id-and-profiles.patch |  68 ++++++++++++
@@ -47271,10 +47355,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b
 +2.7.4
 +
 
-From 2270fbeb9d9d858e15d77347f50e4813c75d4aff Mon Sep 17 00:00:00 2001
+From f303faf857227cee88db21f5e95bd0a7d2f8c06e Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 29 Jan 2016 17:18:50 +0300
-Subject: [PATCH 35/71] [win32] Settings: Added setting to enable/disable MVC
+Subject: [PATCH 35/75] [win32] Settings: Added setting to enable/disable MVC
  decoder.
 
 ---
@@ -47304,10 +47388,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9
      <category id="display">
        <group id="1">
 
-From 2a51cc049289ca6c012ce2f09313ca13266fc37e Mon Sep 17 00:00:00 2001
+From 9f1937bc8941347695d09078e624cc30beab4a6d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Wed, 20 Jan 2016 17:02:16 +0300
-Subject: [PATCH 36/71] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
+Subject: [PATCH 36/75] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
  streams.
 
 ---
@@ -47370,10 +47454,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81
        }
      case AVMEDIA_TYPE_DATA:
 
-From 6a2a77a44d394e51330f10da92d2989171ff99b3 Mon Sep 17 00:00:00 2001
+From a451efc2d79422565ef1cbf931444c3ef5165125 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 25 Feb 2016 11:21:25 +0300
-Subject: [PATCH 37/71] [Stereo3D] Added block_lr and block_rl to supported
+Subject: [PATCH 37/75] [Stereo3D] Added block_lr and block_rl to supported
  modes.
 
 ---
@@ -47423,10 +47507,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814
      i++;
    }
 
-From a27d42b9f6b66f08be5561f6224ffb5af56fe38c Mon Sep 17 00:00:00 2001
+From 39522c63603fb5bf00b95a0eba5df6a626ea240f Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Sat, 23 Jan 2016 10:21:32 +0300
-Subject: [PATCH 38/71] [VideoPlayer] Fix possible wrong aspect.
+Subject: [PATCH 38/75] [VideoPlayer] Fix possible wrong aspect.
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +-
@@ -47446,10 +47530,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd
    else
      m_fForcedAspectRatio = 0.0;
 
-From 61f6644450fcca90960efbdfcbb619d79b46772f Mon Sep 17 00:00:00 2001
+From b362a9d5e20db180bc6fce923188a921e7a0e985 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 22 Jan 2016 18:18:33 +0300
-Subject: [PATCH 39/71] [VideoPlayer] DemuxFFmpeg: ssif remux
+Subject: [PATCH 39/75] [VideoPlayer] DemuxFFmpeg: ssif remux
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt  |   2 +
@@ -47883,10 +47967,10 @@ index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555
    m_discStubExtensions = ".disc";
    // internal music extensions
 
-From 8e01d2a2a958030fae5173fbcf6a14c8ae1997c6 Mon Sep 17 00:00:00 2001
+From 0bd2f0f4af5d90cd685380e36379590a378d024d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:02:46 +0300
-Subject: [PATCH 40/71] [3DBD] Added support of 3D-BluRay playback.
+Subject: [PATCH 40/75] [3DBD] Added support of 3D-BluRay playback.
 
 ---
  lib/DllLibbluray.h                                 |   8 +
@@ -48876,10 +48960,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 0e7e3baf46d1c699dd14b492d81cd11ec656fe69 Mon Sep 17 00:00:00 2001
+From 913cd365b12a9730cb04bb8a9d5ebddde02d5503 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 11 Mar 2016 16:58:53 +0300
-Subject: [PATCH 41/71] [VideoPlayer] HasVideo returns true if video stream
+Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream
  exists. This don't allow start visualization if audio is opened before video.
 
 ---
@@ -48887,7 +48971,7 @@ Subject: [PATCH 41/71] [VideoPlayer] HasVideo returns true if video stream
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index f206847aa8bd9e57c9e558362ef0728fd7737efd..b533aa5395dac512d3b153b44b86d2fa7276ddb2 100644
+index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45fb2497640 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 @@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause()
@@ -48900,10 +48984,10 @@ index f206847aa8bd9e57c9e558362ef0728fd7737efd..b533aa5395dac512d3b153b44b86d2fa
  
  bool CVideoPlayer::HasAudio() const
 
-From 8615e56935a181d5c85e56ca16854f197f5a39cd Mon Sep 17 00:00:00 2001
+From e8a09603950b958dd1934cb460fda960759485f8 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 42/71] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 42/75] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -48940,10 +49024,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45
      CDVDOverlayImage* overlay = new CDVDOverlayImage();
  
 
-From 6b3a976f6e558e23d9561ea37ac5e5e59eb5b801 Mon Sep 17 00:00:00 2001
+From f10689878e33dc69a2ebbd559f41de12e72784c5 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 7 Apr 2016 17:28:50 +0300
-Subject: [PATCH 43/71] [VideoPlayer] Disable reading extension stream from
+Subject: [PATCH 43/75] [VideoPlayer] Disable reading extension stream from
  input stream if decoder doesn't support it.
 
 ---
@@ -49173,7 +49257,7 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa
  
  class CDVDAudioCodec;
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index b533aa5395dac512d3b153b44b86d2fa7276ddb2..505747a6d7c45c0a4e67fefa711c85dd5236e35d 100644
+index b244a21ac083c6f7b0e2d455e2b7a45fb2497640..69b031a5623888a1b9a8c0ca7fe34fe3b1900fdc 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 @@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
@@ -49200,10 +49284,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1
    // classes
    CDVDOverlayContainer* m_pOverlayContainer;
 
-From fbe74a1f5eabeba77ead6a05a30a2c4e2b2ca283 Mon Sep 17 00:00:00 2001
+From 74d399ad03a76c6f63c4fab2ba8ba2760a2f2180 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 16 Sep 2016 11:37:48 +0300
-Subject: [PATCH 44/71] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
+Subject: [PATCH 44/75] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
  platform settings to common settings.
 
 ---
@@ -49299,10 +49383,10 @@ index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540
    m_simpleConditions.insert("have_lcms2");
  #endif
 
-From 79606a9af5952398a535b203d34eab88189b75bf Mon Sep 17 00:00:00 2001
+From 1f0f86550e8cfed2a5de0d436c5c1e1e2ea642a1 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 4 Nov 2016 22:56:56 +0300
-Subject: [PATCH 45/71] [VideoPlayer] SSIF: fix for corner case when mvc stream
+Subject: [PATCH 45/75] [VideoPlayer] SSIF: fix for corner case when mvc stream
  is switched before the last packet is read from previous stream.
 
 ---
@@ -49491,33 +49575,33 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 9134b76ed4f3e94794b24624b5251d03c57c2d16 Mon Sep 17 00:00:00 2001
+From ddc42633af64cfc6e9447d40f988c86a9a04250d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:01:08 +0300
-Subject: [PATCH 46/71] [libbluray] bump libbluray to 0.9.2-mvc.
+Subject: [PATCH 46/75] [libbluray] bump libbluray to 0.9.2-mvc.
 
 ---
  project/BuildDependencies/scripts/0_package.list | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list
-index 33f87aec9968a24f5c9ba35ab5ea2eb1786feaf9..4fdebd9401b40ca18a474aece3e27f4b696c9d5b 100644
+index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c619c436f 100644
 --- a/project/BuildDependencies/scripts/0_package.list
 +++ b/project/BuildDependencies/scripts/0_package.list
 @@ -17,7 +17,7 @@ freetype-db5a22-win32-vc140.7z
  giflib-5.1.4-win32-vc140.7z
  jsonschemabuilder-1.0.0-win32-3.7z
- libass-ddb383-win32-vc140.7z
+ libass-d18a5f1-win32-vc140.7z
 -libbluray-0.9.3-win32-vc140.7z
 +libbluray-0.9.2-mvc-win32-vc120.7z
  libcdio-0.9.3-win32-vc140.7z
  libcec-4.0.1-win32-vc140-2.7z
  libfribidi-0.19.2-win32.7z
 
-From cab608dce138c7ac52f9acb37945a6d2bbe9a523 Mon Sep 17 00:00:00 2001
+From 30060bc20c7f25701009d77d6b566e26ef77fa14 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 29 Feb 2016 17:00:50 +0000
-Subject: [PATCH 47/71] libbluray: Bump to Nevcairie's v0.9.2
+Subject: [PATCH 47/75] libbluray: Bump to Nevcairie's v0.9.2
 
 This includes 3D support
 ---
@@ -51174,10 +51258,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220
 + 
 +     return fp;
 
-From 336c8898720e5c9f50115b1a359188b44f1fec11 Mon Sep 17 00:00:00 2001
+From d3ad5d1c9d8da1ee7c63cd9302bef058b1da1135 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 6 Mar 2016 12:54:59 +0000
-Subject: [PATCH 48/71] mvc: Automatically enable stereo mode
+Subject: [PATCH 48/75] mvc: Automatically enable stereo mode
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++-
@@ -51235,10 +51319,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd
      break;
      case AV_CODEC_ID_MPEG4:
 
-From e8fc139cb043e1718a8cf8e348fefcc4d00f9acf Mon Sep 17 00:00:00 2001
+From f1b065ebbb0f130da3e28a6a4375f9458cee3fd3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 24 Mar 2016 13:02:58 +0000
-Subject: [PATCH 49/71] ffmpeg: mvc: fix for pixelation from packets with no
+Subject: [PATCH 49/75] ffmpeg: mvc: fix for pixelation from packets with no
  pts/dts
 
 ---
@@ -51300,10 +51384,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba
  	cd $(PLATFORM);\
  	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
 
-From 5f2316bdcc751de483d4a52eee31c1c1786469a9 Mon Sep 17 00:00:00 2001
+From 332a8c9c8739a159f62542856c686ee14e996bdd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 11 Nov 2016 15:53:53 +0000
-Subject: [PATCH 50/71] stereoscopicmanager: fixups for rbp
+Subject: [PATCH 50/75] stereoscopicmanager: fixups for rbp
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++
@@ -51541,10 +51625,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782
  };
  
 
-From 32b9a9ab2a9ab92008ae9cc6250b6b898de804f8 Mon Sep 17 00:00:00 2001
+From 2d81f94dcaf52e951bb7e203ea248b48c24d15aa Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 51/71] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 51/75] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -51564,10 +51648,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18
    }
  
 
-From 352b7f1fac766e04179adaf308ad544b31b604cb Mon Sep 17 00:00:00 2001
+From 48664856527a85a6d242649a5dcebf85d9420171 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Nov 2016 18:24:18 +0000
-Subject: [PATCH 52/71] DemuxMVC: fixup after SeekTime API change
+Subject: [PATCH 52/75] DemuxMVC: fixup after SeekTime API change
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +-
@@ -51601,10 +51685,36 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425
    virtual int GetStreamLength() { return 0; };
    virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; };
 
-From 6a1debd2bc377a5d68fbed8c0c134898cc28e4e1 Mon Sep 17 00:00:00 2001
+From 945b547c444e7ec5039c88e31b612c57b25edd1b Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 3 Nov 2014 23:17:46 +0000
+Subject: [PATCH 53/75] [cec] Don't discard buttons when repeat mode is enabled
+
+---
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
+   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
+ 
+   CSingleLock lock(m_critSection);
+-  if (key.iDuration > 0)
++  // avoid the queue getting too long
++  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
++    return;
++  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
+   {
+     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
+     {
+
+From 70d24188f34e2846d42f18146baf43952c31aae3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 54/71] [cec] Temp - more logging
+Subject: [PATCH 54/75] [cec] Temp - more logging
 
 ---
  xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
@@ -51656,10 +51766,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a
  }
  
 
-From 307a15f3b87951d023c06b73f0116dd8af1c9382 Mon Sep 17 00:00:00 2001
+From 0d75b80f8862d67a4edc9f769acc0d18448ad268 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 25 May 2016 18:31:17 +0100
-Subject: [PATCH 55/71] rbp: Hard code the number of buffers to improve audio
+Subject: [PATCH 55/75] rbp: Hard code the number of buffers to improve audio
  sync
 
 ---
@@ -51701,10 +51811,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d
  
  }
 
-From e384deac300900920e7cc9fd489e487dc63668ef Mon Sep 17 00:00:00 2001
+From b7bcc39b920c47e7c4273895feae92d4a82ba08f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 4 Jul 2016 18:30:03 +0100
-Subject: [PATCH 56/71] rbp: Update the GL libs to new naming scheme
+Subject: [PATCH 56/75] rbp: Update the GL libs to new naming scheme
 
 As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues.
 As such we are renaming the firmware GL driver to avoid this.
@@ -51718,7 +51828,7 @@ will be dropped at some point
  3 files changed, 5 insertions(+), 5 deletions(-)
 
 diff --git a/configure.ac b/configure.ac
-index 9bd8d6ec09ff5cf0c6e6caf39850f650f1dd2665..291ff72c1845037f97e215232ab1c2667687f289 100644
+index cbaefbe0a6a42f7d863800d87281a3f680cfea5b..2329e126f807b3eccb8cfd4e6ef3117ec20c85b5 100644
 --- a/configure.ac
 +++ b/configure.ac
 @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then
@@ -51769,10 +51879,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71
  fi
  
 
-From 03bbec2df7c5901415ba5496245ed25ba5841181 Mon Sep 17 00:00:00 2001
+From e63ee8ac3fd87a12bdcf197827a182043e58b4af Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 28 Jun 2016 14:46:01 +0100
-Subject: [PATCH 57/71] ffmpeg: hacky fix for files with GMC
+Subject: [PATCH 57/75] ffmpeg: hacky fix for files with GMC
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++--
@@ -51794,10 +51904,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675
          {
            if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
 
-From 3d5d1b8ef8d74c6c4a41cf7e654c7435f4fe52eb Mon Sep 17 00:00:00 2001
+From 73498b227b428c32c7e5ebc5623d094020fe98a7 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 19 Jul 2016 20:39:18 +0100
-Subject: [PATCH 58/71] mmalrender: Add sharpness control
+Subject: [PATCH 58/75] mmalrender: Add sharpness control
 
 ---
  addons/resource.language.en_gb/resources/strings.po         |  2 +-
@@ -51869,10 +51979,10 @@ index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd
    CCriticalSection m_sharedSection;
    MMAL_COMPONENT_T *m_vout;
 
-From 5375fb0e6dc4ba69f348d0ded98ff36c2084e47e Mon Sep 17 00:00:00 2001
+From 57c94de16036e00a6822e374cc8ebbc8a042dc6b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 14 Oct 2016 15:37:53 +0100
-Subject: [PATCH 59/71] MMALFFMpeg: Report as SW decode in codec overlay info
+Subject: [PATCH 59/75] MMALFFMpeg: Report as SW decode in codec overlay info
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +-
@@ -51892,10 +52002,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0
    CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str());
  }
 
-From 71184cfffaf0216c67b9dd3c600d5c8d805e984a Mon Sep 17 00:00:00 2001
+From 43c6b165b6d0f754f938d54bba00655d436679fd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Nov 2016 18:28:01 +0000
-Subject: [PATCH 60/71] advancedsettings: Add option to set cache size on
+Subject: [PATCH 60/75] advancedsettings: Add option to set cache size on
  libass
 
 E.g to set total cache size in libass to 32M
@@ -51997,10 +52107,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97
      unsigned int m_jsonTcpPort;
  
 
-From 147c688154b43f57ec048166d901920594d70c28 Mon Sep 17 00:00:00 2001
+From 84623dff0ea921cf494fb9f15379b1bbc43844a0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 13 Nov 2016 20:30:15 +0000
-Subject: [PATCH 61/71] [rbp] Experimental limit libass cache size depending on
+Subject: [PATCH 61/75] [rbp] Experimental limit libass cache size depending on
  arm memory size
 
 ---
@@ -52046,10 +52156,10 @@ index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73
    m_libAssCache = 0;
  
 
-From f4d6563f06f5c674a8b845daca0c58913a0f1712 Mon Sep 17 00:00:00 2001
+From b5d95824c6e029b58aaf3b1d6dd2774661925096 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 62/71] [rbp] Use default resampling setting on Pi2
+Subject: [PATCH 62/75] [rbp] Use default resampling setting on Pi2
 
 ---
  system/settings/rbp2.xml | 5 +++++
@@ -52072,10 +52182,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
    </section>
  </settings>
 
-From cb0855a5d1f3d8d64c3d780eee4d36a8cae7e460 Mon Sep 17 00:00:00 2001
+From c6165dc89c629abd2583eb7181e0543d6b69c255 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 1 Dec 2016 17:06:01 +0000
-Subject: [PATCH 63/71] MMALRender: Allow advanced deinterlace with software
+Subject: [PATCH 63/75] MMALRender: Allow advanced deinterlace with software
  decode
 
 Uses YUV420 directly which improves performance.
@@ -52098,10 +52208,10 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e
      status = mmal_port_format_commit(m_deint_output);
      if (status != MMAL_SUCCESS)
 
-From 92cf602789202ca31230bba940bb2e2e551ecbeb Mon Sep 17 00:00:00 2001
+From 15e9791cb79c6c3b5f8c09bba979761451bea04c Mon Sep 17 00:00:00 2001
 From: Nuno Senica <nsenica@gmail.com>
 Date: Tue, 27 Dec 2016 20:59:56 +0000
-Subject: [PATCH 64/71] Apply ffmpeg patches automatically after downloading
+Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading
  and extracting the ffmpeg tar ball
 
 ---
@@ -52132,10 +52242,10 @@ index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccb
    file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper
  "#!/bin/bash
 
-From c1f3b380540377b5b710c46066aa79fa64bc696e Mon Sep 17 00:00:00 2001
+From 358df1970de1f6f107e1681785ed723db0756f0e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 1 May 2016 19:56:43 +0100
-Subject: [PATCH 65/71] omxplayer: Avoid CAEFactory::Suspend which should only
+Subject: [PATCH 65/75] omxplayer: Avoid CAEFactory::Suspend which should only
  be called by application
 
 ---
@@ -52235,10 +52345,10 @@ index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9
    COMXCoreComponent m_omx_render_analog;
    COMXCoreComponent m_omx_render_hdmi;
 
-From 1f20fe91f925c482201feb608b254c4afd235459 Mon Sep 17 00:00:00 2001
+From dd69c1880f97b81981df1ad50f09bfb457ad8532 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 1 Mar 2017 21:40:22 +0000
-Subject: [PATCH 66/71] MMALRender: default to bob (x2) deinterlace for HD
+Subject: [PATCH 66/75] MMALRender: default to bob (x2) deinterlace for HD
 
 There are still issues with some dvb dongles run on the same Pi as playback.
 Default to bob. Users who aren't using these devices will have to manually enable advanced.
@@ -52280,10 +52390,10 @@ index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae
    if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE)
    {
 
-From 8305dec169baacde92f6d2ca4f791d3f281fa67d Mon Sep 17 00:00:00 2001
+From b96bf65f71bca91e4e029ed64c7e3dc86c0d0dad Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 17 Feb 2017 17:58:13 +0000
-Subject: [PATCH 67/71] ffmpeg: Update hevc optimisation to use the gpu service
+Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service
 
 ---
  project/cmake/modules/FindFFMPEG.cmake             |    14 +-
@@ -105595,10 +105705,10 @@ index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e
 ++
 ++
 
-From e6a8a101454e409fd2b6c61324d26252541b6d29 Mon Sep 17 00:00:00 2001
+From 1ec8569a01645467680e3090afba9927cea120d0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 4 Mar 2017 19:25:40 +0000
-Subject: [PATCH 68/71] ffmpeg: Call get_format to fix an issue with MMAL
+Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL
  rendering
 
 ---
@@ -105720,10 +105830,10 @@ index 3d970429012c1f3aede4df0545ced5006c165d50..e070d96fc340f5bff94d72ae9004c4a9
  CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
  ./configure --prefix=$FFMPEG_PREFIX \
 
-From 63c5c8ffa14cc3869411b7ba30198362c7a070d1 Mon Sep 17 00:00:00 2001
+From b230c015d539db71bb2eb04232b25805703014c6 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Mar 2017 22:52:37 +0000
-Subject: [PATCH 69/71] MMAL: Remove periodic prime calls and handle from
+Subject: [PATCH 69/75] MMAL: Remove periodic prime calls and handle from
  buffer destructor
 
 If a number of buffers are released at once we can end up stalled in GetPicture with the buffers
@@ -105817,10 +105927,10 @@ index 9279966fa634f6f5a3e00f12dd528337392cf038..c6ba9b024b3c3bbe53d3f0870dd8c839
    CLog::Log(LOGDEBUG, "%s::%s - stopping", CLASSNAME, __func__);
  }
 
-From 4ce3f65bb006dfa4b8d49b58b257728848a9e8dd Mon Sep 17 00:00:00 2001
+From 6f29617ca776bb2e6459a55710a4a569311c8d7e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Mar 2017 23:22:43 +0000
-Subject: [PATCH 70/71] MMALCodec: Include a timeout of GetPicture in default
+Subject: [PATCH 70/75] MMALCodec: Include a timeout of GetPicture in default
  debug logging
 
 ---
@@ -105841,10 +105951,10 @@ index 22d594cdc217f32f820e3618b4d90a1d75fc769b..e8bc3b930e84e058460b6cfd7caca0d7
  
    return ret;
 
-From 472ede7c96085b80de6779bebd0bbd3482c3b02d Mon Sep 17 00:00:00 2001
+From a3185132fc1828162ad59e09155464b26a7f35b0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 21 Mar 2017 20:15:55 +0000
-Subject: [PATCH 71/71] ffmpeg: Add calls to init and deinit gpu service
+Subject: [PATCH 71/75] ffmpeg: Add calls to init and deinit gpu service
 
 ---
  tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 6 ++++--
@@ -105883,3 +105993,15067 @@ index 852815d5f4ae80771c5304f6f3520b5e49b18a67..b4c15b782a4deb36c35a006e8547ce69
  +  vcsm_exit();
  +
  +  mbox_close(mb);
+
+From 9ef1f2fdde0e49ae3c5da03defa83d32ab2e432d Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 27 Mar 2017 20:06:42 +0100
+Subject: [PATCH 72/75] squash: ffmpeg: hevc: Remove rules that require qasm
+
+---
+ tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index b4c15b782a4deb36c35a006e8547ce69665a10fe..58379fb0874521205184c53be5aae893cfd39d49 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -308,18 +308,6 @@ index fd0d1f0..40d22d2 100644
+         vorbis_parser.o                                                  \
+         xiph.o                                                           \
+  
+-@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+- $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+- $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
+- endif
+-+
+-+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+-+	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+-+
+-+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+-+	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+-+
+-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
+ diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+ index 54efaad..02a89c3 100644
+ --- a/libavcodec/allcodecs.c
+
+From 38a49f21a7430779830d9d4e2468e76de6faf92c Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:11:37 +0100
+Subject: [PATCH 73/75] RBP: Add api to query gpu frame geometry
+
+---
+ xbmc/linux/RBP.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ xbmc/linux/RBP.h   | 12 +++++++
+ 2 files changed, 108 insertions(+), 1 deletion(-)
+
+diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
+index 238eba372af2cbab11d7543c857ee47640901d13..79f932378cf37747be79e65fd0c2e2476f95474f 100644
+--- a/xbmc/linux/RBP.cpp
++++ b/xbmc/linux/RBP.cpp
+@@ -27,6 +27,7 @@
+ #include "utils/log.h"
+ 
+ #include "cores/omxplayer/OMXImage.h"
++#include <interface/mmal/mmal.h>
+ 
+ #include <sys/ioctl.h>
+ #include "rpi/rpi_user_vcsm.h"
+@@ -39,6 +40,41 @@
+ static int mbox_open();
+ static void mbox_close(int file_desc);
+ 
++typedef struct vc_image_extra_uv_s {
++   void *u, *v;
++   int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union {
++   VC_IMAGE_EXTRA_UV_T uv;
++} VC_IMAGE_EXTRA_T;
++
++struct VC_IMAGE_T {
++   unsigned short                  type;           /* should restrict to 16 bits */
++   unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
++   unsigned short                  width;          /* width in pixels */
++   unsigned short                  height;         /* height in pixels */
++   int                             pitch;          /* pitch of image_data array in bytes */
++   int                             size;           /* number of bytes available in image_data array */
++   void                           *image_data;     /* pixel data */
++   VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
++   void                           *metadata;       /* metadata header for the image */
++   void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
++   uint32_t                        mem_handle;     /* the mem handle for relocatable memory storage */
++   int                             metadata_size;  /* size of metadata of each channel in bytes */
++   int                             channel_offset; /* offset of consecutive channels in bytes */
++   uint32_t                        video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
++   uint8_t                         num_channels;   /* number of channels (2 for stereo) */
++   uint8_t                         current_channel;/* the channel this header is currently pointing to */
++   uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
++   uint8_t                         is_channel_linked;     /* Track if the above structure has been used to link the header
++                                                             into a linked-multichannel image */
++   uint8_t                         channel_index;         /* index of the channel this header represents while
++                                                             it is being linked. */
++   uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
++};
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
++
+ CRBP::CRBP()
+ {
+   m_initialized     = false;
+@@ -322,7 +358,7 @@ static unsigned mem_lock(int file_desc, unsigned handle)
+    return p[5];
+ }
+ 
+-unsigned mem_unlock(int file_desc, unsigned handle)
++static unsigned mem_unlock(int file_desc, unsigned handle)
+ {
+    int i=0;
+    unsigned p[32];
+@@ -341,6 +377,32 @@ unsigned mem_unlock(int file_desc, unsigned handle)
+    return p[5];
+ }
+ 
++
++#define GET_VCIMAGE_PARAMS 0x30044
++static int get_image_params(int file_desc, VC_IMAGE_T * img)
++{
++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++    uint32_t * p = buf;
++    void * rimg;
++    int rv;
++
++    *p++ = 0; // size
++    *p++ = 0; // process request
++    *p++ = GET_VCIMAGE_PARAMS;
++    *p++ = sizeof(*img);
++    *p++ = sizeof(*img);
++    rimg = p;
++    memcpy(p, img, sizeof(*img));
++    p += sizeof(*img) / sizeof(*p);
++    *p++ = 0;  // End tag
++    buf[0] = (p - buf) * sizeof(*p);
++
++    rv = mbox_property(file_desc, buf);
++    memcpy(img, rimg, sizeof(*img));
++
++    return rv;
++}
++
+ CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
+ {
+   m_numbytes = numbytes;
+@@ -372,6 +434,39 @@ void CGPUMEM::Flush()
+   vcsm_clean_invalid( &iocache );
+ }
+ 
++AVRpiZcFrameGeometry CRBP::GetFrameGeometry(uint32_t encoding, unsigned short video_width, unsigned short video_height)
++{
++  AVRpiZcFrameGeometry geo = {};
++  struct VC_IMAGE_T img = {};
++
++  if (encoding == MMAL_ENCODING_YUVUV128)
++  {
++    img.type = VC_IMAGE_YUV_UV;
++    img.width = video_width;
++    img.height = video_height;
++    int rc = get_image_params(GetMBox(), &img);
++    assert(rc == 0);
++    const unsigned int stripe_w = 128;
++    geo.stride_y = stripe_w;
++    geo.stride_c = stripe_w;
++    geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++    geo.height_c = img.pitch / stripe_w - geo.height_y;
++    geo.planes_c = 1;
++    geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++  }
++  else if (encoding == MMAL_ENCODING_I420)
++  {
++    geo.stride_y = (video_width + 31) & ~31;
++    geo.stride_c = geo.stride_y / 2;
++    geo.height_y = (video_height + 15) & ~15;
++    geo.height_c = geo.height_y / 2;
++    geo.planes_c = 2;
++    geo.stripes = 1;
++  }
++  else assert(0);
++  return geo;
++}
++
+ double CRBP::AdjustHDMIClock(double adjust)
+ {
+   char response[80];
+diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
+index 815d758e7086d73b4d4eb16849fdbb509a3c251d..a7f07403854b81996cca72eff82e3a7d591c9209 100644
+--- a/xbmc/linux/RBP.h
++++ b/xbmc/linux/RBP.h
+@@ -41,6 +41,17 @@
+ #include "threads/CriticalSection.h"
+ #include "threads/Event.h"
+ 
++
++typedef struct AVRpiZcFrameGeometry
++{
++  unsigned int stride_y;
++  unsigned int height_y;
++  unsigned int stride_c;
++  unsigned int height_c;
++  unsigned int planes_c;
++  unsigned int stripes;
++} AVRpiZcFrameGeometry;
++
+ class CGPUMEM
+ {
+ public:
+@@ -82,6 +93,7 @@ public:
+   uint32_t WaitVsync(uint32_t target = ~0U);
+   void VSyncCallback();
+   int GetMBox() { return m_mb; }
++  AVRpiZcFrameGeometry GetFrameGeometry(uint32_t encoding, unsigned short video_width, unsigned short video_height);
+   double AdjustHDMIClock(double adjust);
+   double GetAdjustHDMIClock() { return m_actual_pll_adjust; }
+ 
+
+From 1856e86917eef62f3069c465d7c8ff2f8e213395 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:12:28 +0100
+Subject: [PATCH 74/75] MMALFFmpeg: Add Sand/YUVUV128 support
+
+---
+ .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp        |  2 +-
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h |  3 ++
+ .../VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp     | 51 +++++++++++++++-------
+ 3 files changed, 39 insertions(+), 17 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+index c820a04c903866862b5ff04b38124ff0f7f7c17f..8444d0df598caef958e4ac3254419f3b4f95c513 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+@@ -241,7 +241,7 @@ enum AVPixelFormat CDVDVideoCodecFFmpeg::GetFormat(struct AVCodecContext * avctx
+ #endif
+ 
+ #ifdef HAS_MMAL
+-    if (*cur == AV_PIX_FMT_YUV420P)
++    if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128)
+     {
+       MMAL::CDecoder* dec = new MMAL::CDecoder(ctx->m_processInfo, ctx->m_hints);
+       if(dec->Open(avctx, ctx->m_pCodecContext, *cur, ctx->m_uSurfacesCount))
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
+index 1e49f09574c2a93b938d5eb405ebcb06543dec01..aecf0c54093092332b4a31a694472669cec84cb5 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
+@@ -41,6 +41,7 @@
+ #include "rendering/RenderSystem.h"
+ #include "cores/VideoPlayer/VideoRenderers/BaseRenderer.h"
+ #include "cores/VideoPlayer/DVDResource.h"
++#include "linux/RBP.h"
+ 
+ 
+ enum MMALState { MMALStateNone, MMALStateHWDec, MMALStateFFDec, MMALStateDeint, };
+@@ -60,6 +61,8 @@ public:
+   unsigned int m_height;
+   unsigned int m_aligned_width;
+   unsigned int m_aligned_height;
++  unsigned int m_size;
++  AVRpiZcFrameGeometry m_geo;
+   uint32_t m_encoding;
+   float m_aspect_ratio;
+   MMALState m_state;
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
+index f9b7172c45d5a0158259ebfb53ea75696f0acb6d..456214a679779469ea52db7ce846a3871147f685 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
+@@ -47,7 +47,6 @@ using namespace MMAL;
+ CMMALYUVBuffer::CMMALYUVBuffer(CDecoder *omv, std::shared_ptr<CMMALPool> pool, uint32_t mmal_encoding, uint32_t width, uint32_t height, uint32_t aligned_width, uint32_t aligned_height, uint32_t size)
+ : CMMALBuffer(pool), m_omv(omv)
+ {
+-  uint32_t size_pic = 0;
+   m_width = width;
+   m_height = height;
+   m_aligned_width = aligned_width;
+@@ -57,21 +56,18 @@ CMMALYUVBuffer::CMMALYUVBuffer(CDecoder *omv, std::shared_ptr<CMMALPool> pool, u
+   mmal_buffer = nullptr;
+   m_rendered = false;
+   m_stills = false;
+-  if (m_encoding == MMAL_ENCODING_I420)
+-    size_pic = (m_aligned_width * m_aligned_height * 3) >> 1;
+-  else if (m_encoding == MMAL_ENCODING_YUVUV128)
+-    size_pic = (m_aligned_width * m_aligned_height * 3) >> 1;
+-  else if (m_encoding == MMAL_ENCODING_ARGB || m_encoding == MMAL_ENCODING_RGBA || m_encoding == MMAL_ENCODING_ABGR || m_encoding == MMAL_ENCODING_BGRA)
+-    size_pic = (m_aligned_width << 2) * m_aligned_height;
+-  else if (m_encoding == MMAL_ENCODING_RGB16)
+-    size_pic = (m_aligned_width << 1) * m_aligned_height;
+-  else assert(0);
+-  if (size)
++
++  if (size == 0)
+   {
+-    assert(size_pic <= size);
+-    size_pic = size;
++    m_geo = g_RBP.GetFrameGeometry(m_encoding, aligned_width, aligned_height);
++    const unsigned int size_y = m_geo.stride_y * m_geo.height_y;
++    const unsigned int size_c = m_geo.stride_c * m_geo.height_c;
++    m_size = (size_y + size_c * m_geo.planes_c) * m_geo.stripes;
+   }
+-  gmem = m_pool->AllocateBuffer(size_pic);
++  else
++    m_size = size;
++  assert(m_size > 0);
++  gmem = m_pool->AllocateBuffer(m_size);
+   if (gmem)
+     gmem->m_opaque = (void *)this;
+   if (VERBOSE && g_advancedSettings.CanLogComponent(LOGVIDEO))
+@@ -155,6 +151,8 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+   uint32_t mmal_format = 0;
+   if (dec->m_fmt == AV_PIX_FMT_YUV420P)
+     mmal_format = MMAL_ENCODING_I420;
++  else if (frame->format == AV_PIX_FMT_SAND128)
++    mmal_format = MMAL_ENCODING_YUVUV128;
+   else if (dec->m_fmt == AV_PIX_FMT_ARGB)
+     mmal_format = MMAL_ENCODING_ARGB;
+   else if (dec->m_fmt == AV_PIX_FMT_RGBA)
+@@ -178,7 +176,7 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+ 
+   CSingleLock lock(dec->m_section);
+   CGPUMEM *gmem = YUVBuffer->gmem;
+-  AVBufferRef *buf = av_buffer_create((uint8_t *)gmem->m_arm, (YUVBuffer->m_aligned_width * YUVBuffer->m_aligned_height * 3)>>1, CDecoder::FFReleaseBuffer, gmem, AV_BUFFER_FLAG_READONLY);
++  AVBufferRef *buf = av_buffer_create((uint8_t *)gmem->m_arm, YUVBuffer->m_size, CDecoder::FFReleaseBuffer, gmem, AV_BUFFER_FLAG_READONLY);
+   if (!buf)
+   {
+     CLog::Log(LOGERROR, "%s::%s av_buffer_create() failed", CLASSNAME, __FUNCTION__);
+@@ -203,6 +201,27 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+     frame->data[1] = frame->data[0] + YUVBuffer->m_aligned_width * YUVBuffer->m_aligned_height;
+     frame->data[2] = frame->data[1] + (YUVBuffer->m_aligned_width>>1) * (YUVBuffer->m_aligned_height>>1);
+   }
++  else if (frame->format == AV_PIX_FMT_SAND128)
++  {
++    const unsigned int size_y = YUVBuffer->m_geo.stride_y * YUVBuffer->m_geo.height_y;
++    const unsigned int size_c = YUVBuffer->m_geo.stride_c * YUVBuffer->m_geo.height_c;
++
++    frame->buf[0] = buf;
++
++    frame->linesize[0] = YUVBuffer->m_geo.stride_y;
++    frame->linesize[1] = YUVBuffer->m_geo.stride_c;
++    frame->linesize[2] = YUVBuffer->m_geo.stride_c;
++    if (YUVBuffer->m_geo.stripes > 1)
++        frame->linesize[3] = YUVBuffer->m_geo.height_y + YUVBuffer->m_geo.height_c;      // abuse: linesize[3] = stripe stride
++
++    frame->data[0] = (uint8_t *)gmem->m_arm;
++    frame->data[1] = frame->data[0] + size_y;
++    if (YUVBuffer->m_geo.planes_c > 1)
++        frame->data[2] = frame->data[1] + size_c;
++
++    frame->extended_data = frame->data;
++    // Leave extended buf alone
++  }
+   else if (dec->m_fmt == AV_PIX_FMT_BGR0)
+   {
+     frame->buf[0] = buf;
+@@ -283,7 +302,7 @@ bool CDecoder::GetPicture(AVCodecContext* avctx, AVFrame* frame, DVDVideoPicture
+   if (!ret)
+     return false;
+ 
+-  if ((frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_BGR0 && frame->format != AV_PIX_FMT_RGB565LE) ||
++  if ((frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_SAND128 && frame->format != AV_PIX_FMT_BGR0 && frame->format != AV_PIX_FMT_RGB565LE) ||
+       frame->buf[1] != nullptr || frame->buf[0] == nullptr)
+     return false;
+ 
+
+From ed215d6a95935eabbbb5f56d9259b24e8ab4929d Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:10:42 +0100
+Subject: [PATCH 75/75] ffmpeg: hevc: Update to latest version
+
+---
+ .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 11940 ++++++++++++-------
+ 1 file changed, 7660 insertions(+), 4280 deletions(-)
+
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index 58379fb0874521205184c53be5aae893cfd39d49..96cfa9ae30e72b377b2561cf7a329e02b9212ceb 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -11,7 +11,7 @@ index 524fb73..305632b 100644
+  /ffplay
+  /ffprobe
+ diff --git a/ffmpeg.c b/ffmpeg.c
+-index 9ffd833..7a86d7e 100644
++index 9ffd833..e2474e5 100644
+ --- a/ffmpeg.c
+ +++ b/ffmpeg.c
+ @@ -23,6 +23,11 @@
+@@ -52,7 +52,7 @@ index 9ffd833..7a86d7e 100644
+  #if HAVE_SYS_RESOURCE_H
+  #include <sys/time.h>
+  #include <sys/types.h>
+-@@ -158,6 +182,169 @@ static int restore_tty;
++@@ -158,6 +182,182 @@ static int restore_tty;
+  static void free_input_threads(void);
+  #endif
+  
+@@ -100,7 +100,7 @@ index 9ffd833..7a86d7e 100644
+ +  mmal_buffer_header_release(buffer);
+ +}
+ +
+-+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h)
+ +{
+ +    MMAL_COMPONENT_T* display;
+ +    MMAL_DISPLAYREGION_T region =
+@@ -111,7 +111,7 @@ index 9ffd833..7a86d7e 100644
+ +        .fullscreen = 0,
+ +        .dest_rect = {x, y, w, h}
+ +    };
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
+ +
+ +    bcm_host_init();  // TODO is this needed?
+ +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+@@ -121,7 +121,7 @@ index 9ffd833..7a86d7e 100644
+ +
+ +    {
+ +        MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-+        format->encoding = MMAL_ENCODING_I420;
+++        format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420;
+ +        format->es->video.width = geo.stride_y;
+ +        format->es->video.height = geo.height_y;
+ +        format->es->video.crop.x = 0;
+@@ -138,7 +138,7 @@ index 9ffd833..7a86d7e 100644
+ +    mmal_port_enable(display->input[0],display_cb_input);
+ +    mmal_port_enable(display->control,display_cb_control);
+ +
+-+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+++    printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
+ +
+ +    return display;
+ +}
+@@ -168,12 +168,24 @@ index 9ffd833..7a86d7e 100644
+ +#ifdef RPI_ZERO_COPY
+ +{
+ +    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++    if (fr_buf == NULL) {
+++        mmal_buffer_header_release(buf);
+++        return;
+++    }
+ +
+ +    buf->user_data = fr_buf;
+ +    buf->data = av_rpi_zc_vc_handle(fr_buf);
+-+    buf->alloc_size =
+-+        buf->length = av_rpi_zc_numbytes(fr_buf);
+-+
+++    buf->offset = av_rpi_zc_offset(fr_buf);
+++    buf->length = av_rpi_zc_length(fr_buf);
+++    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
+++#if 0
+++    {
+++        unsigned int n;
+++        for (n = 0; n < fr->width; n += 128) {
+++            memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2);
+++        }
+++    }
+++#endif
+ +    ++rpi_display_count;
+ +}
+ +#else
+@@ -208,6 +220,7 @@ index 9ffd833..7a86d7e 100644
+ +
+ +static void display_exit(MMAL_COMPONENT_T* display)
+ +{
+++//    sleep(120);
+ +    if (display) {
+ +        mmal_component_destroy(display);
+ +    }
+@@ -222,7 +235,7 @@ index 9ffd833..7a86d7e 100644
+  /* sub2video hack:
+     Convert subtitles to video with alpha to insert them in filter graphs.
+     This is a temporary solution until libavfilter gets real subtitles support.
+-@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
++@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret)
+          avformat_close_input(&input_files[i]->ctx);
+          av_freep(&input_files[i]);
+      }
+@@ -234,7 +247,7 @@ index 9ffd833..7a86d7e 100644
+      for (i = 0; i < nb_input_streams; i++) {
+          InputStream *ist = input_streams[i];
+  
+-@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
++@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret)
+          av_freep(&ist->filters);
+          av_freep(&ist->hwaccel_device);
+  
+@@ -244,7 +257,7 @@ index 9ffd833..7a86d7e 100644
+          avcodec_free_context(&ist->dec_ctx);
+  
+          av_freep(&input_streams[i]);
+-@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
++@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret)
+      }
+      term_exit();
+      ffmpeg_exited = 1;
+@@ -252,7 +265,7 @@ index 9ffd833..7a86d7e 100644
+  }
+  
+  void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
++@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s,
+      if (ost->source_index >= 0)
+          ist = input_streams[ost->source_index];
+  
+@@ -260,7 +273,7 @@ index 9ffd833..7a86d7e 100644
+ +    if (next_picture && ist != NULL)
+ +    {
+ +        if (!rpi_display)
+-+           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+++            rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height);
+ +        display_frame(ist->dec_ctx, rpi_display, next_picture);
+ +    }
+ +#endif
+@@ -268,7 +281,7 @@ index 9ffd833..7a86d7e 100644
+      if (filter->inputs[0]->frame_rate.num > 0 &&
+          filter->inputs[0]->frame_rate.den > 0)
+          duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
+-@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
++@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+          ist->dec_ctx->opaque                = ist;
+          ist->dec_ctx->get_format            = get_format;
+          ist->dec_ctx->get_buffer2           = get_buffer;
+@@ -282,22 +295,23 @@ index 9ffd833..7a86d7e 100644
+  
+          av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
+ diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index fd0d1f0..40d22d2 100644
++index fd0d1f0..1740768 100644
+ --- a/libavcodec/Makefile
+ +++ b/libavcodec/Makefile
+-@@ -5,6 +5,11 @@ NAME = avcodec
++@@ -5,6 +5,12 @@ NAME = avcodec
+  HEADERS = avcodec.h                                                     \
+            avdct.h                                                       \
+            avfft.h                                                       \
+ +          rpi_qpu.h                                                     \
+ +          rpi_shader.h                                                  \
+++	  rpi_shader_cmd.h                                              \
+ +          rpi_mailbox.h                                                 \
+ +          rpi_hevc_transform.h                                          \
+ +          rpi_zc.h                                                      \
+            d3d11va.h                                                     \
+            dirac.h                                                       \
+            dv_profile.h                                                  \
+-@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
++@@ -43,6 +49,10 @@ OBJS = allcodecs.o                                                      \
+         resample.o                                                       \
+         resample2.o                                                      \
+         utils.o                                                          \
+@@ -308,6 +322,22 @@ index fd0d1f0..40d22d2 100644
+         vorbis_parser.o                                                  \
+         xiph.o                                                           \
+  
++@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
++ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
++ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
++ endif
+++
+++QASM := $(SUBDIR)../pi-util/qasm.py
+++
+++ifneq ("$(wildcard $(QASM))","")
+++$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+++	python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+++	python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+++endif
+++
+++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h
+ diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+ index 54efaad..02a89c3 100644
+ --- a/libavcodec/allcodecs.c
+@@ -321,12 +351,14 @@ index 54efaad..02a89c3 100644
+      REGISTER_PARSER(MJPEG,              mjpeg);
+      REGISTER_PARSER(MLP,                mlp);
+ diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+-index a4ceca7..1354c14 100644
++index a4ceca7..cafd25d 100644
+ --- a/libavcodec/arm/Makefile
+ +++ b/libavcodec/arm/Makefile
+-@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
++@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
++ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+  NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+++                                          arm/hevc_misc_neon.o          \
+                                            arm/hevcdsp_deblock_neon.o    \
+ +                                          arm/hevcdsp_epel_neon.o       \
+                                            arm/hevcdsp_idct_neon.o       \
+@@ -1015,18 +1047,592 @@ index 0000000..31d3c59
+ +#endif /* HAVE_ARMV6T2_INLINE */
+ +
+ +#endif /* AVCODEC_ARM_HEVC_CABAC_H */
++diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S
++new file mode 100644
++index 0000000..373576b
++--- /dev/null
+++++ b/libavcodec/arm/hevc_misc_neon.S
++@@ -0,0 +1,62 @@
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++@ rpi_zap_coeff_vals_neon(
+++@   uint16_t * buf,          [r0]
+++@   unsigned int log_n_m2)   [r1]
+++
+++function rpi_zap_coeff_vals_neon, export=1
+++        vmov.i64 q8, #0
+++        adr     r12, zc_tab
+++        vmov.i64 q9, #0
+++        tst     r0, #63
+++        vmov.i64 q10, #0
+++        add     r0, #63
+++        vmov.i64 q11, #0
+++        and     r0, #~63
+++        ldr     pc, [r12, r1, lsl #2]
+++
+++zc_tab:
+++        .word   zc_lc2
+++        .word   zc_lc3
+++        .word   zc_lc4
+++        .word   zc_lc5
+++
+++@ 4*4*2 = 32 bytes: stores 64 bytes if buf is 64-byte aligned, otherwise nothing
+++zc_lc2:
+++        it eq
+++        vstmeq  r0, {q8-q11}
+++        bx      lr
+++
+++@ 16*16*2 = 512 = 64 * 8
+++zc_lc4:
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++@ 8*8*2 = 128
+++zc_lc3:
+++        vstm    r0!, {q8-q11}
+++        vstm    r0,  {q8-q11}
+++        bx      lr
+++
+++@ 32*32*2 = 2048 = 128 * 16
+++zc_lc5:
+++        vmov.i64 q12, #0
+++        vmov.i64 q13, #0
+++        vmov.i64 q14, #0
+++        vmov.i64 q15, #0
+++        mov     r2, #4
+++1:
+++        vstm    r0!, {q8-q15}
+++        subs    r2, #1
+++        vstm    r0!, {q8-q15}
+++        vstm    r0!, {q8-q15}
+++        vstm    r0!, {q8-q15}
+++        bne     1b
+++        bx      lr
+++
+++endfunc
+++
+ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+-index 166bddb..a088cc3 100644
++index 166bddb..9bd0a42 100644
+ --- a/libavcodec/arm/hevcdsp_deblock_neon.S
+ +++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+-@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
++@@ -15,7 +15,7 @@
++  *
++  * You should have received a copy of the GNU Lesser General Public
++  * License along with FFmpeg; if not, write to the Free Software
++- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
++ 
++@@ -31,6 +31,9 @@
++         bxeq     lr
++ .endm
++ 
+++@ Uses: d2, d4, d18, d19
+++@ Returns: d2, d4
+++@ Modifies: d0-d7, d22-d25
++ .macro hevc_loop_filter_chroma_body
++         vsubl.u8  q3, d4, d2
++         vsubl.u8  q11, d18, d19
++@@ -49,6 +52,33 @@
++         vqmovun.s16 d4, q2
++ .endm
++ 
+++
+++@ Uses r2[0:7], r2[8:15]
+++@ Modifies: d0-d7, d22-d25
+++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1
+++        vsubl.u8  q3, \Q0, \P0
+++        vsubl.u8  q11, \P1, \Q1
+++        vshl.i16  q3, #2
+++        vadd.i16  q11, q3
+++
+++        @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all)
+++        vdup.16   d0, r2
+++        vmovl.u8  q0, d0
+++        vuzp.16   d0, d1
+++
+++        vrshr.s16 q11, q11, #3
+++        vneg.s16  q12, q0
+++        vmovl.u8  q2, \Q0
+++        vmin.s16  q11, q11, q0
+++        vmax.s16  q11, q11, q12
+++        vaddw.u8  q1, q11, \P0
+++        vsub.i16  q2, q11
+++        vqmovun.s16 \P0, q1
+++        vqmovun.s16 \Q0, q2
+++.endm
+++
+++
+++
++ .macro hevc_loop_filter_luma_start
++         ldr     r12, [r3]
++         ldr      r3, [r3, #4]
++@@ -60,15 +90,17 @@
++         lsr      r3, #16
++ .endm
++ 
++-.macro hevc_loop_filter_luma_body
+++@ Uses: r2, r3, r12
+++@ Modifies: r5, r6, r7, r8, r9
+++function hevc_loop_filter_luma_body
+++        vmovl.u8  q15, d23
+++        vmovl.u8  q14, d22
+++        vmovl.u8  q13, d21
+++        vmovl.u8  q12, d20
+++        vmovl.u8  q11, d19
+++        vmovl.u8  q10, d18
+++        vmovl.u8  q9, d17
++         vmovl.u8  q8, d16
++-        vmovl.u8  q9, d18
++-        vmovl.u8  q10, d20
++-        vmovl.u8  q11, d22
++-        vmovl.u8  q12, d24
++-        vmovl.u8  q13, d26
++-        vmovl.u8  q14, d28
++-        vmovl.u8  q15, d30
++ 
++         vadd.i16   q7, q9, q11
++         vadd.i16   q6, q14, q12
++@@ -77,7 +109,6 @@
++         vabd.s16   q7, q7, q10
++         vabd.s16   q6, q6, q13
++ 
++-
++         vdup.16    q0, r2
++         vmov       q4, q7
++         vmov       q5, q6
++@@ -152,7 +183,7 @@
++ 
++         and        r9, r8, r7
++         cmp        r9, #0
++-        beq        weakfilter_\@
+++        beq        weakfilter_
++ 
++         vadd.i16  q2, q11, q12
++         vadd.i16  q4, q9, q8
++@@ -210,11 +241,11 @@
++         vbit      q13, q3, q5
++         vbit      q14, q2, q5
++ 
++-weakfilter_\@:
+++weakfilter_:
++         mvn       r8, r8
++         and       r9, r8, r7
++         cmp       r9, #0
++-        beq       ready_\@
+++        beq       ready_
++ 
++         vdup.16    q4, r2
++ 
++@@ -275,75 +306,345 @@ weakfilter_\@:
++         vbit      q11, q0, q5
++         vbit      q12, q4, q5
++ 
++-ready_\@:
+++ready_:
++         vqmovun.s16 d16, q8
++-        vqmovun.s16 d18, q9
++-        vqmovun.s16 d20, q10
++-        vqmovun.s16 d22, q11
++-        vqmovun.s16 d24, q12
++-        vqmovun.s16 d26, q13
++-        vqmovun.s16 d28, q14
++-        vqmovun.s16 d30, q15
++-.endm
+++        vqmovun.s16 d17, q9
+++        vqmovun.s16 d18, q10
+++        vqmovun.s16 d19, q11
+++        vqmovun.s16 d20, q12
+++        vqmovun.s16 d21, q13
+++        vqmovun.s16 d22, q14
+++        vqmovun.s16 d23, q15
+++        mov       pc, lr
+++endfunc
+++
+++@ ff_hevc_v_loop_filter_luma2_neon_8(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
+++function ff_hevc_v_loop_filter_luma2_neon_8, export=1
+++        hevc_loop_filter_luma_start
+++        push     {r4-r10,lr}       @ 8 regs = 32 bytes
+++
+++        ldr      r4, [sp, #40]
+++        b        v_loop_luma_common
+++endfunc
+++
++ 
++ function ff_hevc_v_loop_filter_luma_neon, export=1
++         hevc_loop_filter_luma_start
++-        push     {r5-r11}
+++        push     {r4-r10,lr}
+++
+++        sub      r4, r0, #4
+++v_loop_luma_common:
+++        @ Why this isn't a bitmask to start with I have no idea...
+++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+++        ldr      r5, [sp, #32]
+++        ldrh     r10, [r5]
+++        ldr      r5, [sp, #36]
+++        ldrh     r5, [r5]
+++        orr      r10, r10, r5, lsl #16  @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1]
+++
++         vpush    {d8-d15}
++-        sub      r0, #4
++-        vld1.8   {d16}, [r0], r1
++-        vld1.8   {d18}, [r0], r1
++-        vld1.8   {d20}, [r0], r1
++-        vld1.8   {d22}, [r0], r1
++-        vld1.8   {d24}, [r0], r1
++-        vld1.8   {d26}, [r0], r1
++-        vld1.8   {d28}, [r0], r1
++-        vld1.8   {d30}, [r0], r1
++-        sub      r0, r0, r1, lsl #3
++-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
++-        hevc_loop_filter_luma_body
++-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
++-        vst1.8   {d16}, [r0], r1
++-        vst1.8   {d18}, [r0], r1
++-        vst1.8   {d20}, [r0], r1
++-        vst1.8   {d22}, [r0], r1
++-        vst1.8   {d24}, [r0], r1
++-        vst1.8   {d26}, [r0], r1
++-        vst1.8   {d28}, [r0], r1
++-        vst1.8   {d30}, [r0]
+++
+++        @ Uses slightly fewer instructions to do laned loads than unlaned
+++        @ and transpose.  This also means that we can use the same code for
+++        @ both split & unsplit deblock
+++        vld4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
+++        vld4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
+++
+++        vld4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+++        vld4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+++
+++        vld4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+++        vld4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+++
+++        vld4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+++        vld4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+++
+++        vld4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+++        vld4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+++
+++        vld4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+++        vld4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+++
+++        vld4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+++        vld4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+++
+++        vld4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
+++        vld4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
+++
+++        bl hevc_loop_filter_luma_body
+++
+++        neg     r1, r1
+++
+++        @ no_p[1]
+++        tst     r10, #0xff00
+++        itt ne
+++        addne    r4, r4, r1, lsl #2
+++        bne     1f
+++        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+++        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+++        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+++        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+++
+++1:
+++        @ no_q[1]
+++        tst     r10, #0xff000000
+++        itt ne
+++        addne    r0, r0, r1, lsl #2
+++        bne     2f
+++        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+++        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+++        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+++        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+++
+++2:
+++        @ no_p[0]
+++        tst     r10, #0xff
+++        bne     3f
+++        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+++        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+++        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+++        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
+++
+++3:
+++        @ no_q[0]
+++        tst     r10, #0xff0000
+++        bne     4f
+++        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+++        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+++        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+++        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
+++
+++4:
+++bypasswrite:
++         vpop     {d8-d15}
++-        pop      {r5-r11}
++-        bx lr
+++        pop      {r4-r10,pc}
++ endfunc
++ 
+++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
+++@                                 ptrdiff_t stride, [r1]
+++@                                 int beta,         [r2]
+++@                                 int32_t *tc,      [r3]
+++@                                 uint8_t *no_p,    sp[0]
+++@                                 uint8_t *no_q);   sp[4]
+++@
+++@ Src should always be on 8 byte boundary & all in the same slice
+++
++ function ff_hevc_h_loop_filter_luma_neon, export=1
++         hevc_loop_filter_luma_start
++-        push     {r5-r11}
+++        push     {r4-r10,lr}
+++
++         vpush    {d8-d15}
++         sub      r0, r0, r1, lsl #2
+++
++         vld1.8  {d16}, [r0], r1
+++        vld1.8  {d17}, [r0], r1
++         vld1.8  {d18}, [r0], r1
+++        vld1.8  {d19}, [r0], r1
++         vld1.8  {d20}, [r0], r1
+++        vld1.8  {d21}, [r0], r1
++         vld1.8  {d22}, [r0], r1
++-        vld1.8  {d24}, [r0], r1
++-        vld1.8  {d26}, [r0], r1
++-        vld1.8  {d28}, [r0], r1
++-        vld1.8  {d30}, [r0], r1
++-        sub        r0, r0, r1, lsl #3
++-        add        r0, r1
++-        hevc_loop_filter_luma_body
++-        vst1.8   {d18}, [r0], r1
++-        vst1.8   {d20}, [r0], r1
++-        vst1.8   {d22}, [r0], r1
++-        vst1.8   {d24}, [r0], r1
++-        vst1.8   {d26}, [r0], r1
++-        vst1.8   {d28}, [r0]
++-bypasswrite:
+++        vld1.8  {d23}, [r0]
+++
+++        bl hevc_loop_filter_luma_body
+++
++         vpop     {d8-d15}
++-        pop      {r5-r11}
++-        bx lr
+++
+++        neg     r1, r1
+++        add     r0, r0, r1
+++
+++        @ Why this isn't a bitmask to start with I have no idea...
+++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+++        ldr      r5, [sp, #32]
+++        ldrh     r10, [r5]
+++        ldr      r5, [sp, #36]
+++        ldrh     r5, [r5]
+++        orrs     r10, r10, r5, lsl #16  @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+++        bne      1f
+++
+++        vst1.8  {d22}, [r0], r1
+++        vst1.8  {d21}, [r0], r1
+++        vst1.8  {d20}, [r0], r1
+++        vst1.8  {d19}, [r0], r1
+++        vst1.8  {d18}, [r0], r1
+++        vst1.8  {d17}, [r0]
+++
+++        pop      {r4-r10,pc}
+++
+++@ Partial write
+++1:
+++        vmov     r2, r3, d22
+++        vmov     r4, r5, d21
+++        vmov     r6, r7, d20
+++
+++        tst      r10, #0xff0000
+++        ittt eq
+++        streq    r2, [r0]
+++        streq    r4, [r0, r1]
+++        streq    r6, [r0, r1, lsl # 1]
+++
+++        add      r0, r0, #4
+++        tst      r10, #0xff000000
+++        ittt eq
+++        streq    r3, [r0]
+++        streq    r5, [r0, r1]
+++        streq    r7, [r0, r1, lsl # 1]
+++
+++        vmov     r2, r3, d19
+++        vmov     r4, r5, d18
+++        vmov     r6, r7, d17
+++        add      r0, r0, r1
+++        add      r0, r0, r1, lsl # 1
+++
+++        tst      r10, #0xff00
+++        ittt eq
+++        streq    r3, [r0]
+++        streq    r5, [r0, r1]
+++        streq    r7, [r0, r1, lsl # 1]
+++
+++        tst      r10, #0xff
+++        ittt eq
+++        streq    r2, [r0, #-4]!
+++        streq    r4, [r0, r1]
+++        streq    r6, [r0, r1, lsl # 1]
+++
+++        pop      {r4-r10,pc}
+++
++ endfunc
++ 
+++@ void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src_r,      // r0
+++@                                     unsigned int stride,   // r1
+++@                                     uint32_t tc4,          // r2
+++@                                     unsigned int no_f);    // r3
+++@
+++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++function ff_hevc_h_loop_filter_uv_neon_8, export=1
+++        sub      r0, r0, r1, lsl #1
+++        vld2.8   {d16,d17}, [r0], r1
+++        vld2.8   {d18,d19}, [r0], r1
+++        vld2.8   {d26,d27}, [r0], r1
+++        vld2.8   {d28,d29}, [r0]
+++        sub      r0, r0, r1, lsl #1
+++        hevc_loop_filter_uv_body d16, d18, d26, d28
+++        lsr      r2, r2, #16
+++        hevc_loop_filter_uv_body d17, d19, d27, d29
+++        cmp      r3, #0
+++        bne      1f
+++        vst2.8   {d18,d19}, [r0], r1
+++        vst2.8   {d26,d27}, [r0]
+++        bx       lr
+++
+++        @ At least one no_f bit is set
+++        @ Which means we need to break this apart in an ugly fashion
+++1:      vzip.8   d18, d19
+++        vzip.8   d26, d27
+++        sub      r1, r1, #8
+++
+++        tst      r3, #1
+++        bne      1f
+++        vst1.8   {d18}, [r0]
+++1:      add      r0, r0, #8
+++        tst      r3, #2
+++        bne      2f
+++        vst1.8   {d19}, [r0]
+++2:      add      r0, r0, r1
+++
+++        tst      r3, #4
+++        bne      1f
+++        vst1.8   {d26}, [r0]
+++1:      add      r0, r0, #8
+++        tst      r3, #8
+++        it ne
+++        bxne     lr
+++        vst1.8   {d27}, [r0]
+++        bx       lr
+++
+++endfunc
+++
+++
+++@ void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r,     // r0
+++@                                     unsigned int stride,   // r1
+++@                                     uint32_t tc4,          // r2
+++@                                     uint8_t * src_l,       // r3
+++@                                     unsigned int no_f);   // sp[0]
+++@
+++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++function ff_hevc_v_loop_filter_uv2_neon_8, export=1
+++        vld4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
+++        vld4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0], r1
+++
+++        vld4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+++        vld4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+++
+++        vld4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+++        vld4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+++
+++        vld4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+++        vld4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+++
+++        vld4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+++        vld4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++
+++        vld4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+++        vld4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+++
+++        vld4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+++        vld4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+++
+++        vld4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3]
+++        vld4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0]
+++
+++        hevc_loop_filter_uv_body d16, d18, d26, d28
+++        lsr      r2, r2, #16
+++        hevc_loop_filter_uv_body d17, d19, d27, d29
+++
+++        neg      r1, r1
+++
+++        ldr      r2, [sp, #0]
+++
+++        @ p[1]
+++        tst      r2, #2
+++        itt ne
+++        addne    r3, r3, r1, lsl #2
+++        bne      1f
+++        vst4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3], r1
+++        vst4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+++        vst4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+++        vst4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+++
+++1:
+++        @ q[1]
+++        tst      r2, #8
+++        itt ne
+++        addne    r0, r0, r1, lsl #2
+++        bne 2f
+++        vst4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0], r1
+++        vst4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+++        vst4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+++        vst4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++
+++2:
+++        @ p[0]
+++        tst      r2, #1
+++        bne      3f
+++        vst4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+++        vst4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+++        vst4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+++        vst4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3]
+++
+++3:
+++        @ q[0]
+++        tst      r2, #4
+++        it ne
+++        bxne     lr
+++        vst4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+++        vst4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+++        vst4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+++        vst4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0]
+++
+++        bx       lr
+++endfunc
+++
+++
++ function ff_hevc_v_loop_filter_chroma_neon, export=1
++         hevc_loop_filter_chroma_start
++         sub      r0, #4
++@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+          vst1.8   {d4}, [r0]
+          bx       lr
+  endfunc
+ +
+-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+ + */
+ +function ff_hevc_deblocking_boundary_strengths_neon, export=1
+ +        add         ip, sp, #4*4
+@@ -1147,6 +1753,7 @@ index 166bddb..a088cc3 100644
+ +90:     mov         a3, #1
+ +        b           11b
+ +endfunc
+++
+ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+ new file mode 100644
+ index 0000000..00eab9e
+@@ -1491,10 +2098,10 @@ index 0000000..00eab9e
+ +       .byte 2, 16, 54, 4
+ +       .byte 2, 10, 58, 2
+ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
+-index 5591807..49c70dd 100644
++index 5591807..b6c48ee 100644
+ --- a/libavcodec/arm/hevcdsp_init_neon.c
+ +++ b/libavcodec/arm/hevcdsp_init_neon.c
+-@@ -22,6 +22,8 @@
++@@ -22,11 +22,26 @@
+  #include "libavutil/arm/cpu.h"
+  #include "libavcodec/hevcdsp.h"
+  #include "hevcdsp_arm.h"
+@@ -1503,7 +2110,25 @@ index 5591807..49c70dd 100644
+  
+  void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+  void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+-@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++
+++#ifdef RPI
+++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+++                             unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++                             const uint8_t no_p[2], const uint8_t no_q[2],
+++                             uint8_t * _pix_l);
+++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+++                             unsigned int no_f);
+++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                             uint8_t * src_l,
+++                             unsigned int no_f);
+++#endif
+++
++ void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
++@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+  void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+  
+@@ -1522,10 +2147,20 @@ index 5591807..49c70dd 100644
+ +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+ +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+ +
+++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height,
+++                                   const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo);
+++
+++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height);
+++
+++
+  #define PUT_PIXELS(name) \
+      void name(int16_t *dst, uint8_t *src, \
+                                  ptrdiff_t srcstride, int height, \
+-@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
++@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+  PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+  PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+  #undef PUT_PIXELS
+@@ -1541,7 +2176,7 @@ index 5591807..49c70dd 100644
+  
+  static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                     int height, int width);
+-@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
++@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
+      put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+  }
+  
+@@ -1587,6 +2222,50 @@ index 5591807..49c70dd 100644
+ +    }
+ +}
+ +
+++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{
+++    // SAO band filter for interleaved U/V samples. Width 32 already dealt with (not here);
+++    // the width 16 NEON code works in double lines, so it is only used for even heights.
+++    if (width == 16 && (height & 1) == 0) {
+++        ff_hevc_sao_band_c_neon_8(_dst, _src, stride_dst, stride_src, // dst stride first, as declared
+++                                          sao_offset_val_u, sao_left_class_u,
+++                                          sao_offset_val_v, sao_left_class_v,
+++                                          width, height);
+++    }
+++    else
+++    {
+++        const int shift  = 3; // BIT_DEPTH - 5
+++        int k, y, x;
+++        pixel *dst = (pixel *)_dst;
+++        pixel *src = (pixel *)_src;
+++        int8_t offset_table_u[32] = { 0 };
+++        int8_t offset_table_v[32] = { 0 };
+++
+++        stride_src /= sizeof(pixel);
+++        stride_dst /= sizeof(pixel);
+++
+++        for (k = 0; k < 4; k++) // four consecutive bands (mod 32) from the left class get offsets [1..4]
+++            offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+++        for (k = 0; k < 4; k++)
+++            offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+++
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width * 2; x += 2) // even bytes use the U table, odd bytes the V table
+++            {
+++                dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
+++                dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
+++            }
+++            dst += stride_dst;
+++            src += stride_src;
+++
+++        }
+++    }
+++}
+++
+ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+ +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ +                                          int16_t *_sao_offset_val, int eo, int width, int height)
+@@ -1665,6 +2344,54 @@ index 5591807..49c70dd 100644
+ +        }
+ +    }
+ +}
+++
+++
+++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++                                  int eo, int width, int height)
+++{   // SAO edge filter for interleaved U/V samples: NEON when width/height allow, C otherwise
+++    const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); // source stride is fixed here, not passed in
+++
+++    if (width == 32 && (height & 7) == 0) { // NEON path: width 32 and height a multiple of 8
+++        ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo);
+++    }
+++    else
+++    {
+++        static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; // source index for each slot of the local offset tables
+++        static const int8_t pos[4][2][2] = { // per eo: two { x, y } neighbour steps
+++            { { -1,  0 }, {  1, 0 } }, // horizontal
+++            { {  0, -1 }, {  0, 1 } }, // vertical
+++            { { -1, -1 }, {  1, 1 } }, // 45 degree
+++            { {  1, -1 }, { -1, 1 } }, // 135 degree
+++        };
+++        int8_t sao_offset_val_u[8];  // padding of 3 for vld
+++        int8_t sao_offset_val_v[8];  // padding of 3 for vld
+++        pixel *dst = (pixel *)_dst;
+++        pixel *src = (pixel *)_src;
+++        int a_stride, b_stride;
+++        int x, y;
+++
+++        for (x = 0; x < 5; x++) { // copy the 16-bit offsets into the 8-bit tables, reordered via edge_idx
+++            sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]];
+++            sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]];
+++        }
+++
+++        a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; // x step doubled: U and V bytes alternate
+++        b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width * 2; x += 2) {
+++                int diff0u = CMP(src[x], src[x + a_stride]); // CMP yields -1, 0 or 1
+++                int diff1u = CMP(src[x], src[x + b_stride]);
+++                int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+++                int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+++                dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); // table index is 0..4
+++                dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]);
+++            }
+++            src += stride_src;
+++            dst += stride_dst;
+++        }
+++    }
+++}
+ +#undef CMP
+ +
+ +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+@@ -1674,18 +2401,36 @@ index 5591807..49c70dd 100644
+  av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+  {
+      if (bit_depth == 8) {
+-@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         int x;
++         c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
+++        c->hevc_v_loop_filter_luma_c   = ff_hevc_v_loop_filter_luma_neon;
++         c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
+++        c->hevc_h_loop_filter_luma_c   = ff_hevc_h_loop_filter_luma_neon;
++         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
++         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
+++#ifdef RPI
+++        c->hevc_v_loop_filter_luma2    = ff_hevc_v_loop_filter_luma2_neon_8;
+++        c->hevc_h_loop_filter_uv       = ff_hevc_h_loop_filter_uv_neon_8;
+++        c->hevc_v_loop_filter_uv2      = ff_hevc_v_loop_filter_uv2_neon_8;
+++#endif
++         c->idct[0]                     = ff_hevc_transform_4x4_neon_8;
++         c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
++         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
++@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+          c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
+          c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
+          c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+ +        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+ +          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
+++          c->sao_band_filter_c[x]      = ff_hevc_sao_band_c_neon_wrapper;
+ +          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
+++          c->sao_edge_filter_c[x]      = ff_hevc_sao_edge_c_neon_wrapper;
+ +        }
+++        c->sao_band_filter_c[2]        = ff_hevc_sao_band_c_neon_8;  // width=32
+          put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
+          put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+          put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+-@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+              c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+              c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+              c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+@@ -1707,7 +2452,7 @@ index 5591807..49c70dd 100644
+          c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+          c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+          c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+-@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+          c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+          c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+      }
+@@ -1719,10 +2464,10 @@ index 5591807..49c70dd 100644
+  }
+ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
+ new file mode 100644
+-index 0000000..9c7808d
++index 0000000..08a021d
+ --- /dev/null
+ +++ b/libavcodec/arm/hevcdsp_sao_neon.S
+-@@ -0,0 +1,510 @@
++@@ -0,0 +1,862 @@
+ +/*
+ + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ + *
+@@ -1848,24 +2593,186 @@ index 0000000..9c7808d
+ +
+ +function ff_hevc_sao_band_w64_neon_8, export=1
+ +        init_sao_band
+++
+++        push      {r4, lr}
+++        subs      r12, #1
+++        mov       r4, r1
+++        it ne
+++        addne     r4, r3
+++
+ +1:      subs      r12, #1
+-+        pld       [r1, r3]
+-+        vld1.8    {q8-q9}, [r1, :128]!
+-+        vshr.u8  q12, q8, #3
+-+        vshr.u8  q13, q9, #3
+-+        vld1.8    {q10-q11}, [r1, :128], r3
+-+        vshr.u8  q14, q10, #3
+-+        vshr.u8  q15, q11, #3
+-+        sub       r1, #32
+++        vldm      r1, {q8-q11}
+++        pld       [r4]
+++        vshr.u8   q12, q8, #3
+++        vshr.u8   q13, q9, #3
+++        add       r1, r3
+++        vshr.u8   q14, q10, #3
+++        vshr.u8   q15, q11, #3
+ +        sao_band_64
+-+        vst1.8    {q8-q9}, [r0, :128]!
+-+        vst1.8    {q10-q11}, [r0, :128], r2
+++        it ne
+++        addne     r4, r3
+++        vstm      r0, {q8-q11}
+++        add       r0, r2
+++        bpl       1b
+++
+++        pop       {r4, pc}
+++endfunc
+++
+++
+++@ ff_hevc_sao_band_c_neon_8(
+++@   uint8_t * dst          [r0]
+++@   uint8_t * src          [r1]
+++@   uint32_t dst_stride    [r2]
+++@   uint32_t src_stride    [r3]
+++@   const int16_t * table1 sp[0]
+++@   uint32_t offset1       sp[4]
+++@   const int16_t * table2 sp[8]
+++@   uint32_t offset2       sp[12]
+++@   int width              sp[16]
+++@   int height             sp[20]
+++
+++@ As this is often done in-place on the frame buffer it is worth preloading
+++@ the pixel values but we want to beware of loading outside our buffer to avoid
+++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU)
+++
+++function ff_hevc_sao_band_c_neon_8, export=1
+++        mov     r12, sp
+++        push   {r4-r8, lr}  // 24 bytes
+++
+++        ldm     r12, {r4-r7}
+++
+++        add     r4, #2
+++        add     r6, #2
+++        vld1.16 {d16}, [r4]    @ Unaligned
+++        lsl     r5, r5, #3
+++        vld1.16 {d18}, [r6]
+++        pld     [r1]
+++        vmov.i8  d17, #0
+++        mov     r4, r1
+++        vmov.i8  d19, #0
+++        lsl     r7, r7, #3
+++        vdup.8  q1, r5
+++        ldr     r5, [r12, #16]  @ width
+++        vdup.8  q2, r7
+++        ldr     r12, [r12, #20]
+++        vqmovn.s16 d0, q8
+++        cmp     r5, #16         @ At some point we may want a table lookup
+++        vqmovn.s16 d1, q9
+++        vmov.i8 q3, #128
+++        beq     16f
+++
+++        @ d0 U lookup
+++        @ d1 V lookup
+++        @ q1 U raw offset
+++        @ q2 V raw offset
+++        @ q3 #128
+++
+++        @ r4 = r1 = src - Intended for preload pointer
+++        @ r12 = height
+++
+++        @ Might (unlikely) be called with height == 1
+++        subs      r12, #1
+++        it ne
+++        addne     r4, r3
+++
+++1:
+++        subs      r12, #1
+++        vld2.8    {q8-q9}, [r1, :128]!
+++        vsub.u8   q12, q8, q1
+++        vld2.8    {q10-q11}, [r1, :128], r3
+++        vsub.u8   q14, q10, q1
+++        vsub.u8   q13, q9, q2
+++        sub       r1, #32
+++        vsub.u8   q15, q11, q2
+++        pld       [r4]
+++        vshr.u8   q12, #3
+++        vadd.s8   q8, q3
+++        vshr.u8   q13, #3
+++        vadd.s8   q9, q3
+++
+++        vtbl.8   d24, {d0}, d24
+++        vshr.u8  q14, #3
+++        vtbl.8   d25, {d0}, d25
+++        vshr.u8  q15, #3
+++        vtbl.8   d26, {d1}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d1}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0}, d28
+++        vqadd.s8 q8, q12
+++        vtbl.8   d29, {d0}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d1}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d1}, d31
+++        vsub.s8  q8, q3
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++
+++        it ne
+++        addne     r4, r3        @ Do not inc on final pass
+++        vst2.8    {q8-q9}, [r0, :128]!
+++        vst2.8    {q10-q11}, [r0, :128], r2
+ +        sub       r0, #32
+-+        bne       1b
+++        bpl       1b
+++
+++        pop    {r4-r8, pc}
+++
+++@ -- width 16 (UV pairs) --
+++16:
+++        subs    r12, #2
+++        it ne
+++        addne   r4, r4, r3, lsl #1
+++
+++1:
+++        subs      r12, #2
+++        vld2.8    {q8-q9}, [r1, :128], r3
+++        vsub.u8   q12, q8, q1
+++        vld2.8    {q10-q11}, [r1, :128], r3
+++        vsub.u8   q14, q10, q1
+++        vsub.u8   q13, q9, q2
+++        pld       [r4]
+++        vsub.u8   q15, q11, q2
+++        pld       [r4, r3]
+++        vshr.u8  q12, #3
+++        vadd.s8  q8, q3
+++        vshr.u8  q13, #3
+++        vadd.s8  q9, q3
+++
+++        vtbl.8   d24, {d0}, d24
+++        vshr.u8  q14, #3
+++        vtbl.8   d25, {d0}, d25
+++        vshr.u8  q15, #3
+++        vtbl.8   d26, {d1}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d1}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0}, d28
+++        vqadd.s8 q8, q12
+++        vtbl.8   d29, {d0}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d1}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d1}, d31
+++        vsub.s8  q8, q3
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++
+++        it ne
+++        addne   r4, r4, r3, lsl #1
+++        vst2.8    {q8-q9}, [r0, :128], r2
+++        vst2.8    {q10-q11}, [r0, :128], r2
+++        bpl       1b
+++
+++        pop    {r4-r8, pc}
+ +
+-+        bx lr
+ +endfunc
+ +
+++
+ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+ +        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
+ +        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
+@@ -1875,71 +2782,120 @@ index 0000000..9c7808d
+ +        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+ +.endm
+ +
+-+.macro table64
+-+        vmov.s8 q13, #2 // 2 to all elements
+-+        vmov.32  d24[0], r4  // load offset table from general registers
+-+        vmov.32  d24[1], r5  // load rest of offset table
+-+
+-+        vadd.s8 q0, q13
+-+        vadd.s8 q1, q13
+-+        vadd.s8 q2, q13
+-+        vadd.s8 q3, q13
+-+
+-+        vmov.u8  q15, #128 // s8 #-128
+-+        vtbl.8   d0, {d24}, d0
+-+        vadd.s8  q13,  q4, q15
+-+        vtbl.8   d1, {d24}, d1
+-+        vadd.s8  q14,  q5, q15
+-+        vtbl.8   d2, {d24}, d2
+-+        vqadd.s8 q0, q13
+-+        vtbl.8   d3, {d24}, d3
+-+        vqadd.s8 q1, q14
+-+        vtbl.8   d4, {d24}, d4
+-+        vadd.s8  q13,  q6, q15
+-+        vtbl.8   d5, {d24}, d5
+-+        vadd.s8  q14,  q7, q15
+-+        vtbl.8   d6, {d24}, d6
+-+        vqadd.s8 q2, q13
+-+        vtbl.8   d7, {d24}, d7
+-+        vqadd.s8 q3, q14
+-+        vsub.s8   q0, q15
+-+        vsub.s8   q1, q15
+-+        vsub.s8   q2, q15
+-+        vsub.s8   q3, q15
+-+        vst1.8  {q0-q1}, [r0, :128]!
+-+        vst1.8  {q2-q3}, [r0, :128], r2
+-+        sub     r0, #32
+-+.endm
+ +
+ +// input
+ +// a in q0 - q3
+ +// c in q4 - q7
+ +// b in q8 - q11
+-+// offset table in r7 and r5
+++// offset table r4,r5 and r6,r7
+++//   r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C
+ +// output in q0 - q3
+ +// clobbers q12 - q15
+-+.macro edge_w64_body
+-+        diff32 q12, q13, q0, q1, q0, q1, q4, q5
+-+        diff32 q0, q1, q14, q15, q8, q9, q4, q5
+ +
+-+        vadd.s8  q0, q12 //diff0 + diff1
+-+        vadd.s8  q1, q13
+++@ a <- c <- b
+++@
+++@ It appears that Neon can stall if you try and use results too soon so we try to
+++@ spread our instructions out
+++
+++.macro edgeidx64
+++
+++        vcgt.u8 q12, q4, q0  // c > a -> -1 , otherwise 0
+++        vcgt.u8 q13, q5, q1
+++        vcgt.u8 q14, q6, q2
+++        vcgt.u8 q15, q7, q3
+++
+++        vcgt.u8 q0, q0, q4  // a > c -> -1 , otherwise 0
+++        vcgt.u8 q1, q1, q5
+++        vcgt.u8 q2, q2, q6
+++        vcgt.u8 q3, q3, q7
+++
+++        vsub.s8 q0, q0, q12 // a = sign(c-a)
+++        vsub.s8 q1, q1, q13
+++        vsub.s8 q2, q2, q14
+++        vsub.s8 q3, q3, q15
+++
+++        vcgt.u8 q12, q4, q8  // c > b -> -1 , otherwise 0
+++        vcgt.u8 q13, q5, q9
+++        vcgt.u8 q14, q6, q10
+++        vcgt.u8 q15, q7, q11
+++
+++        vsub.s8 q0, q0, q12
+++        vsub.s8 q1, q1, q13
+++        vsub.s8 q2, q2, q14
+++        vsub.s8 q3, q3, q15
+++
+++        vcgt.u8 q12, q8, q4  // c < b -> -1 , otherwise 0
+++        vcgt.u8 q13, q9, q5
+++        vcgt.u8 q14, q10, q6
+++        vcgt.u8 q15, q11, q7
+++
+++        vadd.s8 q0, q0, q12  // a = sign(c-a) + sign(c-b)
+++        vadd.s8 q1, q1, q13
+++        vmov.u8 q12, #2
+++        vadd.s8 q2, q2, q14
+++        vadd.s8 q3, q3, q15
+++
+++        vadd.s8 q0, q0, q12
+++        vadd.s8 q1, q1, q12
+++        @ whilst vmov dn, rm, rn exists it is a vfp instruction
+++        @ and causes a stall till neon pipe empty - so don't do that!
+++        vmov    d26[0], r4
+++        vmov    d26[1], r5
+++        vmov    d27[0], r6
+++        vmov    d27[1], r7
+++        vadd.s8 q2, q2, q12
+++        vuzp.8    q0, q1
+++        vmov.u8 q15, #128
+++        vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b)
+++
+++        vtbl.8  d0, {d26}, d0
+++        vadd.s8 q12, q4, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d1, {d26}, d1
+++        vadd.s8 q14, q5, q15
+++
+++        vtbl.8  d2, {d27}, d2
+++        vuzp.8    q2, q3
+++
+++        vtbl.8  d3, {d27}, d3
+++
+++        vtbl.8  d4, {d26}, d4
+++        vzip.8    q0, q1
+++
+++        vtbl.8  d5, {d26}, d5
+++        vqadd.s8 q0, q0, q12
+++        vqadd.s8 q1, q1, q14
+++        vadd.s8 q12, q6, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d6, {d27}, d6
+++        vadd.s8 q14, q7, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d7, {d27}, d7
+++        vzip.8   q2, q3
+++
+++        vsub.s8 q0, q0, q15
+++        vqadd.s8 q2, q2, q12
+++        vqadd.s8 q3, q3, q14
+++        vsub.s8 q1, q1, q15
+++        vsub.s8 q2, q2, q15
+++        vsub.s8 q3, q3, q15
+ +
+-+        diff32  q14, q15, q2, q3, q2, q3, q6, q7
+-+        diff32  q2, q3, q12, q13, q10, q11, q6, q7
+-+
+-+        vadd.s8  q2, q14
+-+        vadd.s8  q3, q15
+-+        table64
+ +.endm
+ +
+++function edge_w64_body
+++        edgeidx64
+++        vstm    r0, {q0-q3}
+++        add     r0, r0, r2
+++        bx       lr
+++endfunc
+++
+ +.macro init_edge_64
+-+        push   {r4-r5}
+-+        ldr    r12, [sp, #8] // height
+-+        ldr    r5, [sp, #12] // sao_offset_val_table
+-+        ldr    r4, [r5]
+-+        add    r5, #4
+-+        ldr    r5, [r5]
+++        push   {r4-r8,lr}
+++        ldr    r12, [sp, #24] // height
+++        ldr    r5,  [sp, #28] // sao_offset_val_table
+++        ldrd   r4, r5, [r5]
+++        mov    r6, r4
+++        mov    r7, r5
+ +.endm
+ +
+ +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+@@ -1962,11 +2918,10 @@ index 0000000..9c7808d
+ +        vext.8 q9, q5, q6, #1
+ +        vext.8 q10, q6, q7, #1
+ +        vext.8 q11, q7, q12, #1
+-+        edge_w64_body
+++        bl    edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+@@ -1986,7 +2941,7 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1, :128]!
+ +        vld1.8  {q10-q11}, [r1, :128], r3
+ +        sub     r1, #32
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        // copy c to a
+ +        vmov.64 q0, q4
+ +        vmov.64 q1, q5
+@@ -1999,8 +2954,7 @@ index 0000000..9c7808d
+ +        vmov.64 q7, q11
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+@@ -2024,11 +2978,10 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1]!
+ +        vld1.8  {q10-q11}, [r1]
+ +        sub     r1, #33
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+@@ -2052,13 +3005,157 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1]!
+ +        vld1.8  {q10-q11}, [r1]
+ +        sub     r1, #31
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+++endfunc
+++
+++
+++@ void ff_hevc_sao_edge_c_w64_neon_8(
+++@   uint8_t *_dst,               r0
+++@   uint8_t *_src,               r1
+++@   ptrdiff_t stride_dst,        r2
+++@   ptrdiff_t stride_src,        r3
+++@   int height,                  sp[0]
+++@   int16_t *sao_offset_table_u, sp[4]
+++@   int16_t *sao_offset_table_v, sp[8]
+++@   int eo);                     sp[12]
+++
+++function ff_hevc_sao_edge_c_w64_neon_8, export=1
+++        push   {r4-r8,lr}     // 6 reg = 24
+++        ldr    r5,  [sp, #28] // sao_offset_val_table_u
+++        ldr    r7,  [sp, #32] // sao_offset_val_table_v
+++
+++        @ Load and rearrange offsets
+++        @ Also "convert" from 16bit to 8bit
+++        ldrb    r4, [r5, #2]
+++        ldrb    r8, [r5, #4]
+++        ldrb    r6, [r7, #2]
+++        ldrb    r12, [r7, #4]
+++        orr     r4, r4, r8, lsl #8
+++        orr     r6, r6, r12, lsl #8
+++        ldrb    r8, [r5, #6]
+++        ldrb    r12, [r7, #6]
+++        orr     r4, r4, r8, lsl #24
+++        orr     r6, r6, r12, lsl #24
+++        ldrb    r5, [r5, #8]
+++        ldrb    r7, [r7, #8]
+++
+++        ldr     r12, [sp, #36] // eo
+++        adr     r8, edge_c_tbl_w64
+++        ldr     r8, [r8, r12, lsl #2]
+++
+++        ldr     r12, [sp, #24] // height
+++        vpush   {d8-d15}
+++        mov     pc, r8
+++
+++edge_c_tbl_w64:
+++        .word   ff_hevc_sao_edge_c_eo0_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo1_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo2_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo3_w64_neon_8
+++
+++ff_hevc_sao_edge_c_eo0_w64_neon_8:
+++        sub    r1, #8
+++1:      subs    r12, #1
+++        vld1.64  {d7}, [r1, :64]!
+++        vld1.64  {q4-q5}, [r1, :128]! // load c
+++        vld1.64  {q6-q7}, [r1, :128]!
+++        vld1.64  {d24}, [r1, :64], r3
+++        sub      r1, #72
+++        // load a
+++        vext.8 q0, q3, q4, #14
+++        vext.8 q1, q4, q5, #14
+++        vext.8 q2, q5, q6, #14
+++        vext.8 q3, q6, q7, #14
+++        // load b
+++        vext.8 q8, q4, q5, #2
+++        vext.8 q9, q5, q6, #2
+++        vext.8 q10, q6, q7, #2
+++        vext.8 q11, q7, q12, #2
+++        bl    edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo1_w64_neon_8:
+++        sub     r1, r3
+++        // load a
+++        vldm    r1, {q0-q3}
+++        add     r1, r3
+++        // load c
+++        vldm    r1, {q4-q7}
+++        add     r1, r3
+++1:      subs    r12, #1
+++        // load b
+++        vldm    r1, {q8-q11}
+++        add     r1, r3
+++        bl      edge_w64_body
+++        // copy c to a
+++        vmov.64 q0, q4
+++        vmov.64 q1, q5
+++        vmov.64 q2, q6
+++        vmov.64 q3, q7
+++        // copy b to c
+++        vmov.64 q4, q8
+++        vmov.64 q5, q9
+++        vmov.64 q6, q10
+++        vmov.64 q7, q11
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo2_w64_neon_8:
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        sub     r1, #2
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #30
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        add     r1, #2
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #34
+++        bl      edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo3_w64_neon_8:
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        add     r1, #2
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #34
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        sub     r1, #2
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #30
+++        bl      edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+++
+ +.macro init_edge_32
+ +        ldr     r12, [sp, #4] // sao_offset_val_table
+ +        vld1.32 {d31}, [r12]
+@@ -2175,7 +3272,7 @@ index 0000000..9c7808d
+ +        vext.8  q7, q11, q12, #8
+ +        vext.8  q5, q10, q11, #7
+ +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
+ +        vadd.s8 q0, q12 //diff0 + diff1
+ +        vadd.s8 q1, q13
+ +        table32
+@@ -2215,7 +3312,7 @@ index 0000000..9c7808d
+ +        vext.8  q14, q12, q10, #7
+ +
+ +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
+ +
+ +        vadd.s8 q0, q12 //diff0 + diff1
+ +        vadd.s8 q1, q13
+@@ -2427,26 +3524,21 @@ index ce4bab2..b9b0c78 100644
+ +    .split          = h264_split,
+ +};
+ diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b478065..88dd40b 100644
++index b478065..955e426 100644
+ --- a/libavcodec/hevc.c
+ +++ b/libavcodec/hevc.c
+-@@ -41,8 +41,186 @@
++@@ -41,8 +41,196 @@
+  #include "hevc.h"
+  #include "profiles.h"
+  
+ +#ifdef RPI
+ +  #include "rpi_qpu.h"
+-+  #include "rpi_user_vcsm.h"
+-+  // Move Inter prediction into separate pass
+-+  #define RPI_INTER
+-+
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-+    #define RPI_MULTI_MAILBOX
+-+  #endif
+++  #include "rpi_shader.h"
+++  #include "rpi_shader_cmd.h"
+++  #include "rpi_zc.h"
+ +
+ +  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+++  #define RPI_CACHE_UNIF_MVS  1
+ +
+ +  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+ +  //#define RPI_SIMULATE_QPUS
+@@ -2454,19 +3546,24 @@ index b478065..88dd40b 100644
+ +    #include "pthread.h"
+ +  #endif
+ +
+-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
+-+  static void rpi_execute_transform(HEVCContext *s);
+-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
+-+  static void rpi_execute_pred_cmds(HEVCContext *s);
+-+  static void rpi_execute_inter_cmds(HEVCContext *s);
+-+  static void rpi_begin(HEVCContext *s);
+-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+++  static void worker_core(HEVCContext * const s);
+ +
+++  // We can pred any block height but annoyingly if we do then the TMU cache
+++  // explodes and it goes even slower :-(
+++  #if 0
+++  #define Y_P_MAX_H     16
+++  #define Y_B_MAX_H     16
+++  #else
+++  #define Y_P_MAX_H     64
+++  #define Y_B_MAX_H     64
+++  #endif
+ +#endif
+ +
+ +// #define DISABLE_MC
+ +
+++#define DISABLE_CHROMA 0
+++#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
+++
+ +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+ +
+ +#ifndef av_mod_uintp2
+@@ -2477,45 +3574,65 @@ index b478065..88dd40b 100644
+ +#   define av_mod_uintp2   av_mod_uintp2_c
+ +#endif
+ +
+++#define Y_B_ONLY 0
+++
+  const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+  
+ +
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+++
+++#define MC_DUMMY_X (-32)
+++#define MC_DUMMY_Y (-32)
+ +
+ +// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+ +// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+ +// For each block of 64*64 the smallest block size is 8x4
+ +// We also need an extra command for the setup information
+ +
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+++#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4))
+ +// The QPU code for UV blocks only works up to a block width of 8
+ +#define RPI_CHROMA_BLOCK_WIDTH 8
+ +
+-+#define RPI_LUMA_COMMAND_WORDS 10
+-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+-+
+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+ +
+ +// TODO Chroma only needs 4 taps
+ +
+ +// Actual filter goes -ve, +ve, +ve, -ve using these values
+-+static const uint32_t rpi_filter_coefs[8][1] = {
+-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
+-+        { ENCODE_COEFFS(  2,  58,  10,  2) },
+-+        { ENCODE_COEFFS(  4,  54,  16,  2) },
+-+        { ENCODE_COEFFS(  6,  46,  28,  4) },
+-+        { ENCODE_COEFFS(  4,  36,  36,  4) },
+-+        { ENCODE_COEFFS(  4,  28,  46,  6) },
+-+        { ENCODE_COEFFS(  2,  16,  54,  4) },
+-+        { ENCODE_COEFFS(  2,  10,  58,  2) }
+++static const uint32_t rpi_filter_coefs[8] = {
+++        ENCODE_COEFFS(  0,  64,   0,  0),
+++        ENCODE_COEFFS(  2,  58,  10,  2),
+++        ENCODE_COEFFS(  4,  54,  16,  2),
+++        ENCODE_COEFFS(  6,  46,  28,  4),
+++        ENCODE_COEFFS(  4,  36,  36,  4),
+++        ENCODE_COEFFS(  4,  28,  46,  6),
+++        ENCODE_COEFFS(  2,  16,  54,  4),
+++        ENCODE_COEFFS(  2,  10,  58,  2)
+ +};
+ +
+++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)))
+++
+ +#endif
+ +
+ +
+ +#ifdef RPI_WORKER
+ +
+++typedef struct worker_global_env_s
+++{
+++    volatile int arm_load;
+++    pthread_mutex_t lock;
+++
+++    unsigned int arm_y;
+++    unsigned int arm_c;
+++    unsigned int gpu_y;
+++    unsigned int gpu_c;
+++} worker_global_env_t;
+++
+++static worker_global_env_t worker_global_env =
+++{
+++    .lock = PTHREAD_MUTEX_INITIALIZER
+++};
+++
+++
+ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +
+@@ -2594,17 +3711,7 @@ index b478065..88dd40b 100644
+ +      break;
+ +    }
+ +    LOG_ENTER
+-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+    rpi_launch_vpu_qpu(s);
+-+    // Perform inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+++    worker_core(s);
+ +
+ +    worker_complete_job(s);
+ +    LOG_EXIT
+@@ -2617,7 +3724,7 @@ index b478065..88dd40b 100644
+  /**
+   * NOTE: Each function hls_foo correspond to the function foo in the
+   * specification (HLS stands for High Level Syntax).
+-@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
++@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+  /* free everything allocated  by pic_arrays_init() */
+  static void pic_arrays_free(HEVCContext *s)
+  {
+@@ -2650,36 +3757,40 @@ index b478065..88dd40b 100644
+      av_freep(&s->sao);
+      av_freep(&s->deblock);
+  
+-@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
++@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+      int ctb_count        = sps->ctb_width * sps->ctb_height;
+      int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+  
+ +#ifdef RPI
+-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+++    const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+++    const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
+++    const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+++    const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ +    int job;
+ +
+ +    av_assert0(sps);
+-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
+-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+++//    s->max_ctu_count = sps->ctb_width;
+++//    printf("CTB with=%d\n", sps->ctb_width);
+++//    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+++    s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width);
+++    s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y;
+++    s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV;
+++
+ +    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      printf("Allocated %d\n",coefs_per_row);
+-+      for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-+        if (!s->coeffs_buf_arm[job][0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-+        if (!s->coeffs_buf_arm[job][2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-+      }
+++        for(job=0;job<RPI_MAX_JOBS;job++) {
+++            gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+++            s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+++            if (!s->coeffs_buf_arm[job][0])
+++                goto fail;
+++
+++            gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+++            s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+++            s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+++            if (!s->coeffs_buf_arm[job][2])
+++                goto fail;
+++            s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+++            s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+++        }
+ +    }
+ +#endif
+ +#ifdef RPI_DEBLOCK_VPU
+@@ -2726,8 +3837,6 @@ index b478065..88dd40b 100644
+ +
+ +            dvq->uv_setup_arm = (void*)p_arm;
+ +            dvq->uv_setup_vc = (void*)p_vc;
+-+
+-+            dvq->cmd_id = -1;
+ +        }
+ +
+ +        s->dvq_n = 0;
+@@ -2738,7 +3847,7 @@ index b478065..88dd40b 100644
+      s->bs_width  = (width  >> 2) + 1;
+      s->bs_height = (height >> 2) + 1;
+  
+-@@ -137,6 +422,29 @@ fail:
++@@ -137,6 +434,29 @@ fail:
+      return AVERROR(ENOMEM);
+  }
+  
+@@ -2768,7 +3877,52 @@ index b478065..88dd40b 100644
+  static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+  {
+      int i = 0;
+-@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
++@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
++ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
++ {
++     #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
++-    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
+++    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts;
++     int ret, i;
++ 
++     pic_arrays_free(s);
++@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++     switch (sps->pix_fmt) {
++     case AV_PIX_FMT_YUV420P:
++     case AV_PIX_FMT_YUVJ420P:
+++#if RPI_HEVC_SAND
+++        // Currently geometry calc is stuffed for big sizes
+++        if (sps->width < 2048 && sps->height <= 1088) {
+++            *fmt++ = AV_PIX_FMT_SAND128;
+++        }
+++#endif
++ #if CONFIG_HEVC_DXVA2_HWACCEL
++         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
++ #endif
++@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++         ret = ff_thread_get_format(s->avctx, pix_fmts);
++         if (ret < 0)
++             goto fail;
+++
++         s->avctx->pix_fmt = ret;
++     }
++     else {
++@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++         for(c_idx = 0; c_idx < c_count; c_idx++) {
++             int w = sps->width >> sps->hshift[c_idx];
++             int h = sps->height >> sps->vshift[c_idx];
+++            // ******** Very very nasty allocation kludge for plaited Chroma
++             s->sao_pixel_buffer_h[c_idx] =
++-                av_malloc((w * 2 * sps->ctb_height) <<
+++                av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) <<
++                           sps->pixel_shift);
++             s->sao_pixel_buffer_v[c_idx] =
++-                av_malloc((h * 2 * sps->ctb_width) <<
+++                av_malloc((h * 2 * sps->ctb_width  * (1 + (c_idx == 1))) <<
++                           sps->pixel_shift);
++         }
++     }
++@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s)
+                  (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
+                  pred_weight_table(s, gb);
+              }
+@@ -2780,33 +3934,42 @@ index b478065..88dd40b 100644
+  
+              sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
+              if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
+-@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
++@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+      return 0;
+  }
+  
+ +#ifdef RPI
+ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+ +{
+++    // U & V done on U call in the case of sliced frames
+++    if (rpi_sliced_frame(s->frame) && c_idx > 1)
+++        return;
+++
+ +    if (s->enable_rpi) {
+ +        HEVCLocalContext *lc = s->HEVClc;
+ +        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ +        cmd->type = RPI_PRED_INTRA;
+ +        cmd->size = log2_trafo_size;
+-+        cmd->c_idx = c_idx;
+-+        cmd->x = x0;
+-+        cmd->y = y0;
+ +        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+-+    } else {
+++        cmd->c_idx = c_idx;
+++        cmd->i_pred.x = x0;
+++        cmd->i_pred.y = y0;
+++        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+++    }
+++    else if (rpi_sliced_frame(s->frame) && c_idx != 0) {
+++        s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx);
+++    }
+++    else {
+ +        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ +    }
+++
+ +}
+ +#endif
+ +
+  static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                                int xBase, int yBase, int cb_xBase, int cb_yBase,
+                                int log2_cb_size, int log2_trafo_size,
+-@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+      if (lc->cu.pred_mode == MODE_INTRA) {
+          int trafo_size = 1 << log2_trafo_size;
+          ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
+@@ -2819,7 +3982,7 @@ index b478065..88dd40b 100644
+      }
+  
+      if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+-@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+@@ -2831,7 +3994,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cb[i])
+                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+@@ -2843,7 +4006,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cr[i])
+                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                      trafo_size_h, trafo_size_v);
+@@ -2855,7 +4018,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cb[i])
+                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                  trafo_size_h, trafo_size_v);
+@@ -2867,7 +4030,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cr[i])
+                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+              int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+              ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+@@ -2914,17 +4077,162 @@ index b478065..88dd40b 100644
+              }
+          }
+      }
+-@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++@@ -1275,47 +1675,120 @@ do {
++     return 0;
++ }
++ 
++-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+++
+++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++ {
++-    HEVCLocalContext *lc = s->HEVClc;
++     GetBitContext gb;
++-    int cb_size   = 1 << log2_cb_size;
++-    int stride0   = s->frame->linesize[0];
++-    uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
++-    int   stride1 = s->frame->linesize[1];
++-    uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++-    int   stride2 = s->frame->linesize[2];
++-    uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
++-
++-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
++-                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
++-                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
++-                          s->ps.sps->pcm.bit_depth_chroma;
++-    const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
++     int ret;
++ 
++-    if (!s->sh.disable_deblocking_filter_flag)
++-        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
++-
++     ret = init_get_bits(&gb, pcm, length);
++     if (ret < 0)
++         return ret;
++ 
++-    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
++-    if (s->ps.sps->chroma_format_idc) {
++-        s->hevcdsp.put_pcm(dst1, stride1,
+++#ifdef RPI
+++    if (rpi_sliced_frame(s->frame)) {
+++        s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0),
+++                           s->frame->linesize[0],
+++                           cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+++
+++        s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]),
+++                           s->frame->linesize[1],
++                            cb_size >> s->ps.sps->hshift[1],
++                            cb_size >> s->ps.sps->vshift[1],
++                            &gb, s->ps.sps->pcm.bit_depth_chroma);
++-        s->hevcdsp.put_pcm(dst2, stride2,
++-                           cb_size >> s->ps.sps->hshift[2],
++-                           cb_size >> s->ps.sps->vshift[2],
++-                           &gb, s->ps.sps->pcm.bit_depth_chroma);
++     }
+++    else
+++#endif
+++    {
+++        const int stride0   = s->frame->linesize[0];
+++        uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+++        const int   stride1 = s->frame->linesize[1];
+++        uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+++        const int   stride2 = s->frame->linesize[2];
+++        uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+++
+++        s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+++        if (s->ps.sps->chroma_format_idc) {
+++            s->hevcdsp.put_pcm(dst1, stride1,
+++                               cb_size >> s->ps.sps->hshift[1],
+++                               cb_size >> s->ps.sps->vshift[1],
+++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
+++            s->hevcdsp.put_pcm(dst2, stride2,
+++                               cb_size >> s->ps.sps->hshift[2],
+++                               cb_size >> s->ps.sps->vshift[2],
+++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
+++        }
++ 
+++    }
++     return 0;
++ }
++ 
+++#ifdef RPI
+++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
+++{
+++    int16_t * const coeffs = (buf_no != 3) ?
+++        s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] :
+++        s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n;
+++    s->num_coeffs[s->pass0_job][buf_no] += n;
+++    return coeffs;
+++}
+++#endif
+++
+++// x * 2^(y*2)
+++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
+++{
+++    return x << (y * 2);
+++}
+++
+++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
+++{
+++    // Length in bits
+++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
+++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) +
+++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]);
+++
+++    const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
+++
+++    if (!s->sh.disable_deblocking_filter_flag)
+++        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+++
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        // Copy coeffs
+++        const int blen = (length + 7) >> 3;
+++        // Round allocated bytes up to nearest 32 to avoid alignment confusion
+++        // Allocation is in int16_t s
+++        // As we are only using 1 byte per sample and the coeff buffer allows 2 per
+++        // sample this rounding doesn't affect the total size we need to allocate for
+++        // the coeff buffer
+++        int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1);
+++        memcpy(coeffs, pcm, blen);
+++
+++        // Our coeff stash assumes that any partially allocated 64byte lump
+++        // is zeroed so make that true.
+++        {
+++            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
+++            if ((-(intptr_t)eopcm & 63) != 0)
+++                memset(eopcm, 0, -(intptr_t)eopcm & 63);
+++        }
+++
+++        // Add command
+++        {
+++            HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++            cmd->type = RPI_PRED_I_PCM;
+++            cmd->size = log2_cb_size;
+++            cmd->i_pcm.src = coeffs;
+++            cmd->i_pcm.x = x0;
+++            cmd->i_pcm.y = y0;
+++            cmd->i_pcm.src_len = length;
+++        }
+++        return 0;
+++    }
+++#endif
+++
+++    return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
+++}
+++
++ /**
++  * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
++  *
++@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+   * @param luma_offset additive offset applied to the luma prediction value
+   */
+  
+-+#ifdef RPI_INTER
+-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+++#if RPI_INTER
+ +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ +                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+ +                        int block_w, int block_h, int luma_weight, int luma_offset)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_LUMA_UNI;
+ +    cmd->dst = dst;
+ +    cmd->dststride = dststride;
+@@ -2941,9 +4249,10 @@ index b478065..88dd40b 100644
+ +
+ +static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ +                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1,
+++                       const struct MvField * const current_mv)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_LUMA_BI;
+ +    cmd->dst = dst;
+ +    cmd->dststride = dststride;
+@@ -2961,17 +4270,17 @@ index b478065..88dd40b 100644
+ +    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride,
+++                          int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_CHROMA_UNI;
+ +    cmd->dst = dst0;
+ +    cmd->dststride = dststride;
+ +    cmd->src = src0;
+ +    cmd->srcstride = srcstride;
+-+    cmd->mv = current_mv->mv[reflist];
+++    cmd->mv = *mv;
+ +    cmd->x_off = x_off;
+ +    cmd->y_off = y_off;
+ +    cmd->block_w = block_w;
+@@ -2980,10 +4289,10 @@ index b478065..88dd40b 100644
+ +    cmd->offset = chroma_offset;
+ +}
+ +
+-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+++                         int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+ +    cmd->dst = dst0;
+ +    cmd->dststride = dststride;
+@@ -3001,14 +4310,12 @@ index b478065..88dd40b 100644
+ +    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+#else
+-+#define RPI_REDIRECT(fn) fn
+ +#endif
+ +
+  static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                          AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                          int block_w, int block_h, int luma_weight, int luma_offset)
+-@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                             (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+      int idx              = ff_hevc_pel_weight[block_w];
+  
+@@ -3019,7 +4326,7 @@ index b478065..88dd40b 100644
+      x_off += mv->x >> 2;
+      y_off += mv->y >> 2;
+      src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+   * @param mv1 motion vector1 (relative to block position) to get pixel data from
+   * @param current_mv current motion vector structure
+   */
+@@ -3028,7 +4335,7 @@ index b478065..88dd40b 100644
+                         AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                         int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+  {
+-@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+      uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+      uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+  
+@@ -3039,7 +4346,7 @@ index b478065..88dd40b 100644
+      if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+          x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+          y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+-@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+      intptr_t _mx         = mx << (1 - hshift);
+      intptr_t _my         = my << (1 - vshift);
+  
+@@ -3050,7 +4357,7 @@ index b478065..88dd40b 100644
+      x_off += mv->x >> (2 + hshift);
+      y_off += mv->y >> (2 + vshift);
+      src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
++@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+      int hshift = s->ps.sps->hshift[1];
+      int vshift = s->ps.sps->vshift[1];
+  
+@@ -3061,13 +4368,422 @@ index b478065..88dd40b 100644
+      intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+      intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+      intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+-@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
++@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
+      }
+  }
+  
+ -static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+ -                                int nPbW, int nPbH,
+ -                                int log2_cb_size, int partIdx, int idx)
+++
+++#if RPI_INTER
+++
+++static HEVCRpiLumaPred *
+++rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val)
+++{
+++    HEVCRpiLumaPred * yp = s->curr_pred_y;
+++    HEVCRpiLumaPred * ypt = yp + 1;
+++    for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) {
+++        if (ypt->load < yp->load)
+++            yp = ypt;
+++    }
+++
+++//        yp->load += load_val;
+++    ++yp->load;
+++    return yp;
+++}
+++
+++// Queue QPU commands for uni-predicted luma. The block is tiled into pieces of at most
+++// 16 x Y_P_MAX_H; the first (up to) 8 columns may be merged into a preceding 8-wide command.
+++static void
+++rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
+++           const int nPbW, const int nPbH,
+++           const Mv *const mv,
+++           const int weight_mul,
+++           const int weight_offset,
+++           AVFrame *const src_frame)
+++{
+++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);
+++//    rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
+++//                    mv, x0, y0, nPbW, nPbH,
+++//                    weight_mul, weight_offset);
+++    {
+++        const unsigned int mx          = mv->x & 3;
+++        const unsigned int my          = mv->y & 3;
+++        const unsigned int my_mx       = (my << 8) | mx;
+++        const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
+++        const int x1_m3 = x0 + (mv->x >> 2) - 3;
+++        const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++        const uint32_t src_vc_address_y = get_vc_address_y(src_frame);
+++        uint32_t dst_addr = get_vc_address_y(s->frame) + y_off;
+++        const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul);
+++
+++        // Potentially we could change the assembly code to support taller sizes in one go
+++        for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H)
+++        {
+++            const uint32_t src_yx_y = y1_m3 + start_y;
+++            int start_x = 0;
+++            const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H);
+++
+++#if 1
+++            // As Y-pred operates on two independent 8-wide src blocks we can merge
+++            // this pred with the previous one if the previous one is 8 pel wide,
+++            // the same height as the current block, immediately to the left of our
+++            // current dest block and mono-pred.
+++
+++            qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p;
+++            if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr)
+++            {
+++                const int bw = FFMIN(nPbW, 8);
+++                qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx;
+++
+++                last_y8_lx->next_src2_x = x1_m3;
+++                last_y8_lx->next_src2_y = src_yx_y;
+++                last_y8_lx->next_src2_base = src_vc_address_y;
+++                last_y8_p->p.w += bw;
+++                last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21);
+++                last_y8_p->p.wo2 = wo;
+++
+++                s->last_y8_p = NULL;
+++                s->last_y8_lx = NULL;
+++                start_x = bw;
+++#if RPI_TSTATS
+++                ++s->tstats.y_pred1_y8_merge;
+++#endif
+++            }
+++#endif
+++
+++            for (; start_x < nPbW; start_x += 16)
+++            {
+++                const int bw = FFMIN(nPbW - start_x, 16);
+++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);
+++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;
+++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;
+++#if RPI_TSTATS
+++                {
+++                    HEVCRpiStats *const ts = &s->tstats;
+++                    if (mx == 0 && my == 0)
+++                        ++ts->y_pred1_x0y0;
+++                    else if (mx == 0)
+++                        ++ts->y_pred1_x0;
+++                    else if (my == 0)
+++                        ++ts->y_pred1_y0;
+++                    else
+++                        ++ts->y_pred1_xy;
+++
+++                    if (nPbW > 8)
+++                        ++ts->y_pred1_wgt8;
+++                    else
+++                        ++ts->y_pred1_wle8;
+++
+++                    if (nPbH > 16)
+++                        ++ts->y_pred1_hgt16;
+++                    else
+++                        ++ts->y_pred1_hle16;
+++                }
+++#endif
+++                cmd_y[-1].next_fn = s->qpu_filter;
+++                cmd_lx->next_src1_x = x1_m3 + start_x;
+++                cmd_lx->next_src1_y = src_yx_y;
+++                cmd_lx->next_src1_base = src_vc_address_y;
+++                if (bw <= 8)
+++                {
+++                    cmd_lx->next_src2_x = MC_DUMMY_X;
+++                    cmd_lx->next_src2_y = MC_DUMMY_Y;
+++                    cmd_lx->next_src2_base = s->qpu_dummy_frame;
+++                }
+++                else
+++                {
+++                    cmd_lx->next_src2_x = x1_m3 + start_x + 8;
+++                    cmd_lx->next_src2_y = src_yx_y;
+++                    cmd_lx->next_src2_base = src_vc_address_y;
+++                }
+++                cmd_y->p.w = bw;
+++                cmd_y->p.h = bh;
+++                cmd_y->p.mymx21 = my2_mx2_my_mx;
+++                cmd_y->p.wo1 = wo;
+++                cmd_y->p.wo2 = wo;
+++                cmd_y->p.dst_addr =  dst_addr + start_x;
+++                yp->last_lx = cmd_y;
+++                yp->qpu_mc_curr = cmd_y + 1;
+++
+++                if (bw == 8) {
+++                    s->last_y8_lx = cmd_lx;
+++                    s->last_y8_p = cmd_y;
+++                }
+++            }
+++        }
+++    }
+++}
+++
+++// Queue QPU commands for bi-predicted luma. The block is tiled into pieces of at most
+++// 8 x Y_B_MAX_H, each reading one source from src_frame and one from src_frame2.
+++static void
+++rpi_pred_y_b(HEVCContext * const s,
+++           const int x0, const int y0,
+++           const int nPbW, const int nPbH,
+++           const struct MvField *const mv_field,
+++           AVFrame *const src_frame,
+++           AVFrame *const src_frame2)
+++{
+++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);
+++    const Mv * const mv  = mv_field->mv + 0;
+++    const Mv * const mv2 = mv_field->mv + 1;
+++//    rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
+++//           mv, x0, y0, nPbW, nPbH,
+++//           src_frame2, mv2, mv_field);
+++    {
+++        const unsigned int mx          = mv->x & 3;
+++        const unsigned int my          = mv->y & 3;
+++        const unsigned int my_mx = (my<<8) | mx;
+++        const unsigned int mx2          = mv2->x & 3;
+++        const unsigned int my2          = mv2->y & 3;
+++        const unsigned int my2_mx2 = (my2<<8) | mx2;
+++        const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
+++        const int x1 = x0 + (mv->x >> 2) - 3;
+++        const int y1 = y0 + (mv->y >> 2) - 3;
+++        const int x2 = x0 + (mv2->x >> 2) - 3;
+++        const int y2 = y0 + (mv2->y >> 2) - 3;
+++        const unsigned int ref_idx0 = mv_field->ref_idx[0];
+++        const unsigned int ref_idx1 = mv_field->ref_idx[1];
+++        const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] +
+++                     s->sh.luma_offset_l1[ref_idx1] + 1;
+++        const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
+++        const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
+++
+++        uint32_t dst = get_vc_address_y(s->frame) + y_off;
+++        const uint32_t src1_base = get_vc_address_y(src_frame);
+++        const uint32_t src2_base = get_vc_address_y(src_frame2);
+++
+++        for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H)
+++        {
+++            const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H);
+++            for (int start_x=0; start_x < nPbW; start_x += 8)
+++            { // B blocks work 8 at a time
+++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);
+++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;
+++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;
+++#if RPI_TSTATS
+++                {
+++                    HEVCRpiStats *const ts = &s->tstats;
+++                    const unsigned int mmx = mx | mx2;
+++                    const unsigned int mmy = my | my2;
+++                    if (mmx == 0 && mmy == 0)
+++                        ++ts->y_pred2_x0y0;
+++                    else if (mmx == 0)
+++                        ++ts->y_pred2_x0;
+++                    else if (mmy == 0)
+++                        ++ts->y_pred2_y0;
+++                    else
+++                        ++ts->y_pred2_xy;
+++
+++                    if (nPbH > 16)
+++                        ++ts->y_pred2_hgt16;
+++                    else
+++                        ++ts->y_pred2_hle16;
+++                }
+++#endif
+++                cmd_y[-1].next_fn = s->qpu_filter_b;
+++                cmd_lx->next_src1_x = x1 + start_x;
+++                cmd_lx->next_src1_y = y1 + start_y;
+++                cmd_lx->next_src1_base = src1_base;
+++                cmd_lx->next_src2_x = x2 + start_x;
+++                cmd_lx->next_src2_y = y2 + start_y;
+++                cmd_lx->next_src2_base = src2_base;
+++                cmd_y->p.w = FFMIN(nPbW - start_x, 8);
+++                cmd_y->p.h = bh;
+++                cmd_y->p.mymx21 = my2_mx2_my_mx;
+++                cmd_y->p.wo1 = wo1;
+++                cmd_y->p.wo2 = wo2;
+++                cmd_y->p.dst_addr =  dst + start_x;
+++                yp->last_lx = cmd_y;
+++                yp->qpu_mc_curr = cmd_y + 1;
+++            }
+++            dst += s->frame->linesize[0] * Y_B_MAX_H; // advance by the rows just queued
+++        }
+++    }
+++}
+++
+++
+++// Pick the chroma prediction group with the smallest load (first one wins ties).
+++static HEVCRpiChromaPred *
+++rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val)
+++{
+++    HEVCRpiChromaPred *const base = s->curr_pred_c;
+++    HEVCRpiChromaPred *best = base;
+++    for (unsigned int grp = 1; grp != QPU_N_GRP_UV; ++grp)
+++        if (base[grp].load < best->load)
+++            best = base + grp;
+++    // Weighting by load_val is noticeably better, but stays disabled until the
+++    // queue length problems are sorted out; count one unit per request instead.
+++    best->load += 1;
+++    return best;
+++}
+++
+++// Queue QPU commands for uni-predicted chroma. The block is tiled into pieces of at
+++// most RPI_CHROMA_BLOCK_WIDTH x 16; each piece goes to the least-loaded chroma group.
+++static void
+++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c,
+++  const int nPbW_c, const int nPbH_c,
+++  const Mv * const mv,
+++  const int16_t * const c_weights,
+++  const int16_t * const c_offsets,
+++  AVFrame * const src_frame)
+++{
+++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
+++#if 0
+++    av_assert0(s->frame->linesize[1] == s->frame->linesize[2]);
+++
+++    rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1],
+++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
+++                c_weights[0], c_offsets[0]);
+++
+++    rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2],
+++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
+++                c_weights[1], c_offsets[1]);
+++#endif
+++    {
+++        const int hshift           = s->ps.sps->hshift[1];
+++        const int vshift           = s->ps.sps->vshift[1];
+++        // Integer source position: x is scaled by hshift, y must be scaled by vshift.
+++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;
+++        const uint32_t src_base_u = get_vc_address_u(src_frame);
+++        const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
+++        const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
+++        const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]);
+++        const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]);
+++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
+++
+++        for(int start_y=0;start_y < nPbH_c;start_y+=16)
+++        {
+++            const int bh = FFMIN(nPbH_c-start_y, 16);
+++
+++            for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
+++            {
+++                HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3);
+++                qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;
+++                qpu_mc_pred_c_t * const last_l0 = cp->last_l0;
+++                const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+++                // Function pointer goes in the preceding slot, source fetch in the group's last L0 command.
+++                u[-1].next_fn  = s->qpu_filter_uv;
+++                last_l0->next_src_x = x1_c + start_x;
+++                last_l0->next_src_y = y1_c + start_y;
+++                last_l0->next_src_base_c = src_base_u;
+++                u[0].p.h = bh;
+++                u[0].p.w = bw;
+++                u[0].p.coeffs_x = x_coeffs;
+++                u[0].p.coeffs_y = y_coeffs;
+++                u[0].p.wo_u = wo_u;
+++                u[0].p.wo_v = wo_v;
+++                u[0].p.dst_addr_c = dst_base_u + start_x * 2;
+++                cp->last_l0 = u;
+++                cp->qpu_mc_curr = u + 1;
+++            }
+++
+++            dst_base_u += s->frame->linesize[1] * 16;
+++        }
+++    }
+++}
+++
+++// Queue QPU commands for bi-predicted chroma. Each piece of at most
+++// RPI_CHROMA_BLOCK_WIDTH x 16 uses two command slots: b0 (list 0) then b1 (list 1).
+++static void
+++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
+++  const int nPbW_c, const int nPbH_c,
+++  const struct MvField * const mv_field,
+++  const int16_t * const c_weights,
+++  const int16_t * const c_offsets,
+++  const int16_t * const c_weights2,
+++  const int16_t * const c_offsets2,
+++  AVFrame * const src_frame,
+++  AVFrame * const src_frame2)
+++{
+++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
+++#if 0
+++    rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2,
+++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0);
+++
+++    rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2,
+++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1);
+++#endif
+++    {
+++        const int hshift = s->ps.sps->hshift[1];
+++        const int vshift = s->ps.sps->vshift[1];
+++        const Mv * const mv = mv_field->mv + 0;
+++        const Mv * const mv2 = mv_field->mv + 1;
+++        // List 0: the fractional MV part selects the filter, the integer part the source position.
+++        const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
+++        const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
+++        const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
+++        const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
+++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1; // vertical component uses vshift
+++        // List 1: same derivation for the second motion vector.
+++        const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+++        const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+++        const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
+++        const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
+++        const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
+++        const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1; // vertical component uses vshift
+++
+++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;
+++
+++        for (int start_y = 0; start_y < nPbH_c; start_y += 16) {
+++          const unsigned int bh = FFMIN(nPbH_c-start_y, 16);
+++          // We are allowed 3/4 powers of two as well as powers of 2
+++          av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2);
+++
+++          for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) {
+++              const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+++
+++              HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3);
+++              qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;
+++              qpu_mc_pred_c_t * const last_l0 = cp->last_l0;
+++              qpu_mc_pred_c_t * const last_l1 = cp->last_l1;
+++
+++              u[-1].next_fn = s->qpu_filter_uv_b0;
+++              last_l0->next_src_x = x1_c + start_x;
+++              last_l0->next_src_y = y1_c + start_y;
+++              last_l0->next_src_base_c = get_vc_address_u(src_frame);
+++
+++              u[0].next_fn = 0;  // Ignored - 2 block cmd
+++              u[0].next_src_x = x2_c + start_x;
+++              u[0].next_src_y = y2_c + start_y;
+++              u[0].next_src_base_c = get_vc_address_u(src_frame2);
+++
+++              u[0].b0.h = bh;  // already clamped to 16 above
+++              u[0].b0.w = bw;  // already clamped to RPI_CHROMA_BLOCK_WIDTH above
+++              u[0].b0.coeffs_x = coefs0_x;
+++              u[0].b0.coeffs_y = coefs0_y;
+++              u[0].b0.weight_u = c_weights[0]; // Weight L0 U
+++              u[0].b0.weight_v = c_weights[1]; // Weight L0 V
+++              u[0].b0.dummy0 = 0;  // Intermediate results are not written back in first pass of B filtering
+++
+++              last_l1->next_src_x = x2_c + start_x;
+++              last_l1->next_src_y = y2_c + start_y;
+++              last_l1->next_src_base_c = get_vc_address_u(src_frame2);
+++
+++              u[1].b1.dummy0 = 0;  // w,h inherited from b0
+++              u[1].b1.coeffs_x = coefs1_x;
+++              u[1].b1.coeffs_y = coefs1_y;
+++              u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]);
+++              u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]);
+++              u[1].b1.dst_addr_c = dst_base_u + start_x * 2;
+++
+++              cp->last_l0 = u;
+++              cp->last_l1 = u + 1;
+++              cp->qpu_mc_curr = u + 2;
+++          }
+++
+++          dst_base_u += s->frame->linesize[1] * 16;
+++        }
+++    }
+++}
+++#endif
+++
+++
+++
+ +static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ +                                const int nPbW, const int nPbH,
+ +                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+@@ -3080,7 +4796,7 @@ index b478065..88dd40b 100644
+      int merge_idx = 0;
+      struct MvField current_mv = {{{ 0 }}};
+  
+-@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+      int y_cb             = y0 >> log2_min_cb_size;
+      int x_pu, y_pu;
+      int i, j;
+@@ -3090,315 +4806,112 @@ index b478065..88dd40b 100644
+  
+      if (!skip_flag)
+          lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+-@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+++#if RPI_INTER
+ +        if (s->enable_rpi) {
+-+            const Mv * const mv    = &current_mv.mv[0];
+-+            const unsigned int mx          = mv->x & 3;
+-+            const unsigned int my          = mv->y & 3;
+-+            const unsigned int my_mx       = (my<<8) | mx;
+-+            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
+-+            const int x1_m3 = x0 + (mv->x >> 2) - 3;
+-+            const int y1_m3 = y0 + (mv->y >> 2) - 3;
+-+            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+-+            uint32_t * y = s->curr_y_mvs;
+-+
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+-+
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  const int bw = nPbW-start_x;
+-+                  const int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
+++              s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+++              ref0->frame);
+ +        } else
+ +#endif
+ +        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                      &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                      s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                      s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-+#ifdef RPI_INTER_QPU
+-+          if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                uint32_t *u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++#if RPI_INTER
+++            if (s->enable_rpi) {
+++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
+++                  s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+++                  ref0->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
++             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                            0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                            s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+-         }
+-@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+-+#ifdef RPI_LUMA_QPU
+++#if RPI_INTER
+ +        if (s->enable_rpi) {
+-+            const int reflist = 1;
+-+            const Mv *mv    = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            uint32_t *y = s->curr_y_mvs;
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+-+                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
+++              s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+++              ref1->frame);
+ +        } else
+ +#endif
+-+
+ +        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                      &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                      s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +            if (s->enable_rpi) {
+-+                const int reflist = 1;
+-+                const int hshift           = s->ps.sps->hshift[1];
+-+                const int vshift           = s->ps.sps->vshift[1];
+-+                const Mv * const mv        = &current_mv.mv[reflist];
+-+                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                const intptr_t _mx         = mx << (1 - hshift);
+-+                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                const int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                const int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                uint32_t * u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      const int bw = nPbW_c-start_x;
+-+                      const int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
+++                  s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+++                  ref1->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
++             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                            1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                            s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+- 
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+-         }
+-@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi && 0) {
+-+            const Mv *mv    = &current_mv.mv[0];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            const Mv *mv2    = &current_mv.mv[1];
+-+            int mx2          = mv2->x & 3;
+-+            int my2          = mv2->y & 3;
+-+            int my2_mx2 = (my2<<8) + mx2;
+-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int x2 = x0 + (mv2->x >> 2);
+-+            int y2 = y0 + (mv2->y >> 2);
+-+            uint32_t *y = s->curr_y_mvs;
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+
+-+                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-+                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+-+                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+-+
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++#if RPI_INTER
+++        if (s->enable_rpi) {
+++            rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+ +        } else
+ +#endif
+ +        {
+-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+++            luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                     ref1->frame, &current_mv.mv[1], &current_mv);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +          if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                const Mv *mv2         = &current_mv.mv[1];
+-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+-+                intptr_t _mx2         = mx2 << (1 - hshift);
+-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
+-+
+-+
+-+                uint32_t *u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+-+                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
+-+                      *u++ = 0;
+-+
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx2][0];
+-+                      *u++ = rpi_filter_coefs[_my2][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++              rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
+++                           &current_mv,
+++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+++                           s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+++                           s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+++                           ref0->frame,
+++                           ref1->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
++             chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                           x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+  
+--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
+-         }
+-     }
+-@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
++@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
++                 intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
++                 ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
++                 if (s->ps.sps->pcm.loop_filter_disable_flag)
+++                {
++                     set_deblocking_bypass(s, x0, y0, log2_cb_size);
+++                }
++ 
++                 if (ret < 0)
++                     return ret;
++@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+      lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+  }
+  
+@@ -3415,6 +4928,7 @@ index b478065..88dd40b 100644
+ +    s->num_dblk_cmds[job] = 0;
+ +}
+ +
+++#if 0
+ +static void rpi_execute_transform(HEVCContext *s)
+ +{
+ +    int i=2;
+@@ -3430,7 +4944,7 @@ index b478065..88dd40b 100644
+ +        s->hevcdsp.idct[5-2](coeffs, 32);
+ +    }*/
+ +
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+++    rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ +                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ +                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+@@ -3441,12 +4955,16 @@ index b478065..88dd40b 100644
+ +    for(i=0;i<4;i++)
+ +        s->num_coeffs[job][i] = 0;
+ +}
+++#endif
+++
+ +
+-+static void rpi_execute_pred_cmds(HEVCContext *s)
+++// I-pred, transform_and_add for all blocks types done here
+++// All ARM
+++static void rpi_execute_pred_cmds(HEVCContext * const s)
+ +{
+ +  int i;
+ +  int job = s->pass1_job;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+++  const HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+ +#ifdef RPI_WORKER
+ +  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+ +#else
+@@ -3454,43 +4972,65 @@ index b478065..88dd40b 100644
+ +#endif
+ +
+ +  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+-+      if (cmd->type == RPI_PRED_INTRA) {
+-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
+-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
+-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-+      } else {
+-+#ifdef RPI_PRECLEAR
+-+          int trafo_size = 1 << cmd->size;
+-+#endif
+-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+++//      printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+++
+++      switch (cmd->type)
+++      {
+++          case RPI_PRED_INTRA:
+++              lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
+++              lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+++              lc->na.cand_left         = (cmd->na >> 3) & 1;
+++              lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+++              lc->na.cand_up           = (cmd->na >> 1) & 1;
+++              lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+++              if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0)
+++                  s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+++              else
+++                  s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+++              break;
+++
+++          case RPI_PRED_ADD_RESIDUAL:
+++              s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ +#ifdef RPI_PRECLEAR
+-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+++              memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache
+ +#endif
+++              break;
+++          case RPI_PRED_ADD_RESIDUAL_U:
+++              s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+++              break;
+++          case RPI_PRED_ADD_RESIDUAL_V:
+++              s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+++              break;
+++
+++          case RPI_PRED_I_PCM:
+++              pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
+++              break;
+++
+++          default:
+++              av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
+++              abort();
+ +      }
+ +  }
+ +  s->num_pred_cmds[job] = 0;
+ +}
+ +
+-+static void rpi_execute_inter_cmds(HEVCContext *s)
+++// Do any inter-pred that we want to do in software
+++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here
+++// All ARM
+++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only)
+ +{
+-+    int job = s->pass1_job;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-+    int n,cidx;
+++    unsigned int cidx;
+ +    AVFrame myref;
+ +    AVFrame myref1;
+ +    struct MvField mymv;
+-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+-+        printf("Overflow inter_cmds\n");
+-+        exit(-1);
+-+    }
+-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+++
+++    for(; n>0 ; n--, cmd++) {
+++        av_assert0(0);
+++
+ +        switch(cmd->cmd) {
+ +        case RPI_CMD_LUMA_UNI:
+++            if (b_only)
+++                break;
+ +            myref.data[0] = cmd->src;
+ +            myref.linesize[0] = cmd->srcstride;
+ +            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+@@ -3507,6 +5047,8 @@ index b478065..88dd40b 100644
+ +                       &myref1, &cmd->mv1, &mymv);
+ +            break;
+ +        case RPI_CMD_CHROMA_UNI:
+++            if (b_only)
+++                break;
+ +            mymv.mv[0] = cmd->mv;
+ +            chroma_mc_uni(s, cmd->dst,
+ +                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+@@ -3528,618 +5070,385 @@ index b478065..88dd40b 100644
+ +            break;
+ +        }
+ +    }
+-+    s->num_mv_cmds[job] = 0;
+ +}
+ +
+-+static void rpi_do_all_passes(HEVCContext *s)
+++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only)
+ +{
+-+    // Kick off QPUs and VPUs
+-+    rpi_launch_vpu_qpu(s);
+-+    // Perform luma inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+    // Prepare next batch
+-+    rpi_begin(s);
+++    const int job = s->pass1_job;
+++
+++    if (!qpu_luma || luma_b_only)
+++        do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma);
+++    s->num_mv_cmds_y[job] = 0;
+++    if (!qpu_chroma || chroma_b_only)
+++        do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma);
+++    s->num_mv_cmds_c[job] = 0;
+ +}
+ +
+ +#endif
+ +
+ +#ifdef RPI
+++// Set initial uniform job values & zero ctu_count
+ +static void rpi_begin(HEVCContext *s)
+ +{
+++#if RPI_INTER
+ +    int job = s->pass0_job;
+ +    int i;
+-+#ifdef RPI_INTER_QPU
+-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[job][i] = s->mvs_base[job][i];
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = pic_width;
+-+        *s->u_mvs[job][i]++ = pic_height;
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+-+        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+    }
+-+    s->curr_u_mvs = s->u_mvs[job][0];
+-+#endif
+ +
+-+#ifdef RPI_LUMA_QPU
+-+    for(i=0;i<12;i++) {
+-+        // This needs to have a generally similar structure to the
+-+        // actual filter code as various pipelined bits need to land correctly
+-+        // when inserted by the filter requests
+-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+-+        *s->y_mvs[job][i]++ = 0; // y_x
+-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
+-+        *s->y_mvs[job][i]++ = 0; // y2_x2
+-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+-+        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight demon + 6
+-+        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+-+        *s->y_mvs[job][i]++ = 0; // Next kernel
+++    const uint16_t pic_width_y        = s->ps.sps->width;
+++    const uint16_t pic_height_y       = s->ps.sps->height;
+++
+++    const uint16_t pic_width_c        = s->ps.sps->width >> s->ps.sps->hshift[1];
+++    const uint16_t pic_height_c       = s->ps.sps->height >> s->ps.sps->vshift[1];
+++
+++    for(i=0; i < QPU_N_UV;i++) {
+++        HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i;
+++        qpu_mc_pred_c_t * u = cp->qpu_mc_base;
+++
+++        // Chroma setup is a double block with L0 fetch
+++        // and other stuff in the 1st block and L1 fetch
+++        // in the 2nd along with a lot of dummy vars
+++        // This could be packed a lot tighter but it would make
+++        // L0, L1 management a lot harder
+++
+++        u->next_fn = 0;
+++        u->next_src_x = 0;
+++        u->next_src_y = 0;
+++        u->next_src_base_c = 0;
+++        u->s0.pic_cw = pic_width_c;
+++        u->s0.pic_ch = pic_height_c;
+++        u->s0.stride2 = rpi_sliced_frame_stride2(s->frame);
+++        u->s0.stride1 = s->frame->linesize[1];
+++        u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6;
+++        u->s0.dummy0 = 0;
+++        cp->last_l0 = u;
+++        ++u;
+++
+++        u->next_fn = 0;
+++        u->next_src_x = 0;
+++        u->next_src_y = 0;
+++        u->next_src_base_c = 0;
+++        u->s1.dummy0 = 0;
+++        u->s1.dummy1 = 0;
+++        u->s1.dummy2 = 0;
+++        u->s1.dummy3 = 0;
+++        u->s1.dummy4 = 0;
+++        u->s1.dummy5 = 0;
+++        cp->last_l1 = u;
+++        ++u;
+++
+++        cp->load = 0;
+++        cp->qpu_mc_curr = u;
+++    }
+++    s->curr_pred_c = NULL;
+++
+++    for(i=0;i < QPU_N_Y;i++) {
+++        HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i;
+++        qpu_mc_pred_y_t * y = yp->qpu_mc_base;
+++
+++        y->next_src1_x = 0;
+++        y->next_src1_y = 0;
+++        y->next_src1_base = 0;
+++        y->next_src2_x = 0;
+++        y->next_src2_y = 0;
+++        y->next_src2_base = 0;
+++        y->s.pic_h = pic_height_y;
+++        y->s.pic_w = pic_width_y;
+++        y->s.stride2 = rpi_sliced_frame_stride2(s->frame);
+++        y->s.stride1 = s->frame->linesize[0];
+++        y->s.wdenom = s->sh.luma_log2_weight_denom + 6;
+++        y->s.dummy0 = 0;
+++        y->next_fn = 0;
+++        yp->last_lx = y;
+++        ++y;
+++
+++        yp->load = 0;
+++        yp->qpu_mc_curr = y;
+ +    }
+-+    s->curr_y_mvs = s->y_mvs[job][0];
+++    s->curr_pred_y = NULL;
+++    s->last_y8_p = NULL;
+++    s->last_y8_lx = NULL;
+ +#endif
+ +    s->ctu_count = 0;
+ +}
+ +#endif
+ +
+-+#ifdef RPI_SIMULATE_QPUS
+ +
+-+static int32_t clipx(int x,int FRAME_WIDTH)
+++#if RPI_INTER
+++static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
+ +{
+-+	if (x<=0) return 0;
+-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+-+	return x;
+-+}
+++    unsigned int i;
+++    const uint32_t exit_fn = qpu_fn(mc_exit);
+++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);
+++    unsigned int tc = 0;
+++    HEVCRpiJob * const jb = s->jobs + job;
+++
+++    // Add final commands to Q
+++    for(i = 0; i != QPU_N_Y; ++i) {
+++        HEVCRpiLumaPred * const yp = jb->luma_mvs + i;
+++        qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx;
+++
+++        // We will always have had L0 if we have L1 so only test L0
+++        if (px != yp->qpu_mc_base)
+++            tc = 1;
+++
+++        yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
+++
+++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+++        px->next_src1_x = MC_DUMMY_X;
+++        px->next_src1_y = MC_DUMMY_Y;
+++        px->next_src1_base = s->qpu_dummy_frame;
+++        px->next_src2_x = MC_DUMMY_X;
+++        px->next_src2_y = MC_DUMMY_Y;
+++        px->next_src2_base = s->qpu_dummy_frame;
+++
+++        yp->last_lx = NULL;
+++    }
+ +
+-+static int32_t clipy(int y,int FRAME_HEIGHT)
+-+{
+-+	if (y<=0) return 0;
+-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+-+	return y;
+++    return tc;
+ +}
+ +
+-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+-+{
+-+   int32_t vsum = 0;
+-+   int x, y;
+++#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c
+++#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n)
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job)
+++{
+++    unsigned int i;
+++    const uint32_t exit_fn = qpu_fn(mc_exit_c);
+++    const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV));
+++    unsigned int tc = 0;
+++    HEVCRpiJob * const jb = s->jobs + job;
+++
+++    // Add final commands to Q
+++    for(i = 0; i != QPU_N_UV; ++i) {
+++        HEVCRpiChromaPred * const cp = jb->chroma_mvs + i;
+++        qpu_mc_pred_c_t *const p0 = cp->last_l0;
+++        qpu_mc_pred_c_t *const p1 = cp->last_l1;
+++
+++        // We will always have had L0 if we have L1 so only test L0
+++        if (p0 != cp->qpu_mc_base)
+++            tc = 1;
+++
+++        cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
+++
+++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+++        p0->next_src_x = MC_DUMMY_X;
+++        p0->next_src_y = MC_DUMMY_Y;
+++        p0->next_src_base_c = s->qpu_dummy_frame;
+++        p1->next_src_x = MC_DUMMY_X;
+++        p1->next_src_y = MC_DUMMY_Y;
+++        p1->next_src_base_c = s->qpu_dummy_frame;;
+++
+++        cp->last_l0 = NULL;
+++        cp->last_l1 = NULL;
+++    }
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+++    return tc;
+++}
+++#endif
+ +
+-+      vsum += lumaFilter[my][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+round)>>denom)+offset;
+++#ifdef RPI
+ +
+-+   return av_clip_uint8( vsum );
+-+}*/
+ +
+-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+++static void flush_frame(HEVCContext *s,AVFrame *frame)
+ +{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int chromaFilterH[4];
+-+  int chromaFilterV[4];
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+  for(i=0;i<4;i++) {
+-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+-+  }
+++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+++  rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+++  rpi_cache_flush_finish(rfe);
+++}
+ +
+-+   for (y = 0; y < 4; y++) {
+-+      int32_t hsum = 0;
+ +
+-+      for (x = 0; x < 4; x++)
+-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+++// Core execution tasks
+++static void worker_core(HEVCContext * const s)
+++{
+++    worker_global_env_t * const wg = &worker_global_env;
+++    int arm_cost = 0;
+++//    vpu_qpu_wait_h sync_c;
+++    vpu_qpu_wait_h sync_y;
+++    int qpu_luma = 0;
+++    int qpu_chroma = 0;
+++    int gpu_load;
+++    int arm_load;
+++    static const int arm_const_cost = 2;
+++
+++//    static int z = 0;
+++
+++    const int job = s->pass1_job;
+++    unsigned int flush_start = 0;
+++    unsigned int flush_count = 0;
+++
+++    const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++
+++    if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) {
+++        vpu_qpu_job_add_vpu(vqj,
+++            vpu_get_fn(),
+++            vpu_get_constants(),
+++            s->coeffs_buf_vc[job][2],
+++            s->num_coeffs[job][2] >> 8,
+++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+++            s->num_coeffs[job][3] >> 10,
+++            0);
+ +
+-+      vsum += chromaFilterV[y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+++        rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+++    }
+ +
+-+   return vsum;
+-+}
+ +
+-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+++#if RPI_INTER
+++    pthread_mutex_lock(&wg->lock);
+ +
+-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+++//    ++z;
+++    gpu_load = vpu_qpu_current_load();
+++    arm_load = avpriv_atomic_int_get(&wg->arm_load);
+++#if 0 // Y_B_ONLY
+++    qpu_luma =  gpu_load + 2 < arm_load;
+++    qpu_chroma = gpu_load < arm_load + 8;
+++#elif 0
+++    qpu_luma =  gpu_load < arm_load + 2;
+++    qpu_chroma = gpu_load < arm_load + 8;
+++#else
+++    qpu_chroma = 1;
+++    qpu_luma = 1;
+++#endif
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++    arm_cost = !qpu_chroma * 2 + !qpu_luma * 3;
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost);
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+++    wg->gpu_c += qpu_chroma;
+++    wg->gpu_y += qpu_luma;
+++    wg->arm_c += !qpu_chroma;
+++    wg->arm_y += !qpu_luma;
+ +
+-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+ +
+-+   return vsum;
+-+}
+++//    if ((z & 511) == 0) {
+++//        printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d    \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y);
+++//    }
+ +
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+-+{
+-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-+  int pitch = frame->linesize[cIdx];
+-+  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+-+    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+-+  if (p>=base && p<base+pitch*pic_height) {
+-+    return frame->data[cIdx] + (p-base);
+-+  }
+-+  return NULL;
+-+}
+ +
+-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+-+{
+-+  SliceHeader *sh   = &s->sh;
+-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+-+  int i;
+-+  if (arm) return arm;
+-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+-+  {
+-+    for(i=0;i<sh->nb_refs[L0];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  if (sh->slice_type == B_SLICE) {
+-+    for(i=0;i<sh->nb_refs[L1];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+++    {
+++        int (*d)[2] = s->dblk_cmds[job];
+++        unsigned int high=(*d)[1];
+++        int n;
+++
+++        flush_start = high;
+++        for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+++            unsigned int y = (*d)[1];
+++            flush_start = FFMIN(flush_start, y);
+++            high=FFMAX(high,y);
+++        }
+++        // Avoid flushing past end of frame
+++        flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
+ +    }
+-+  }
+-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+-+  exit(-1);
+-+  return NULL;
+-+}
+ +
+-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint8_t *ref_u_base;
+-+  uint8_t *ref_v_base;
+-+  uint32_t frame_width = p[5];
+-+  uint32_t frame_height = p[6];
+-+  uint32_t pitch = p[7];
+-+  uint32_t dst_pitch = p[8];
+-+  int32_t offset_before = p[9];
+-+  int32_t denom = p[10];
+-+  uint32_t vpm_id = p[11];
+-+  uint32_t tmp_u_dst[256];
+-+  uint32_t tmp_v_dst[256];
+-+  while(1) {
+-+    p += 12;
+-+    next_kernel = p[0-12];
+-+    x0 = p[1-12];
+-+    y0 = p[2-12];
+-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+-+      int x,y;
+-+      uint32_t width_height = p[5];
+-+      uint32_t hcoeffs = p[6];
+-+      uint32_t vcoeffs = p[7];
+-+      uint32_t offset_weight_u = p[8];
+-+      uint32_t offset_weight_v = p[9];
+-+      uint8_t *this_u_dst;
+-+      uint8_t *this_v_dst;
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
+-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
+-+      if (next_kernel!=s->mc_filter_uv_b0)
+-+      {
+-+        this_u_dst = compute_arm_addr(s,p[10],1);
+-+        this_v_dst = compute_arm_addr(s,p[11],2);
+-+      }
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter_uv) {
+-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          } else if (next_kernel==s->mc_filter_uv_b0) {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            tmp_u_dst[x+y*16] = refa;
+-+            tmp_v_dst[x+y*16] = refb;
+-+          } else {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+++#if !DISABLE_CHROMA
+++    if (qpu_chroma && mc_terminate_uv(s, job) != 0)
+++    {
+++        HEVCRpiJob * const jb = s->jobs + job;
+++        const uint32_t code = qpu_fn(mc_setup_c);
+++        uint32_t * p;
+++        unsigned int i;
+++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+ +
+-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+-+{
+-+  uint32_t next_kernel;
+-+  int y_x,y2_x2;
+-+  int x0;
+-+  int y0;
+-+  int x2;
+-+  int y2;
+-+  uint32_t *p0 = p;
+-+  uint8_t *ref_y_base;
+-+  uint8_t *ref_y2_base;
+-+  uint32_t frame_width_height = p[4];
+-+  uint32_t frame_width = frame_width_height>>16;
+-+  uint32_t frame_height = (frame_width_height<<16)>>16;
+-+  uint32_t pitch = p[5];
+-+  uint32_t dst_pitch = p[6];
+-+  int offset_shift = p[7];
+-+  int32_t offset_before = offset_shift>>16;
+-+  int32_t denom = (offset_shift<<16)>>16;
+-+  while(1) {
+-+    p += 9;
+-+    next_kernel = p[8-9];
+-+    y_x = p[0-9];
+-+    x0 = (y_x<<16)>>16;
+-+    y0 = y_x>>16;
+-+    y2_x2 = p[2-9];
+-+    x2 = (y2_x2<<16)>>16;
+-+    y2 = y2_x2>>16;
+-+
+-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+      int x,y;
+-+      uint32_t width_height = p[4];
+-+      uint32_t my2_mx2_my_mx = p[5];
+-+      uint32_t offset_weight = p[6];
+-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      uint8_t *dst_base = s->frame->data[0];
+-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
+-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter) {
+-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+-+            refa = av_clip_uint8(refa);
+-+            this_dst[x+y*dst_pitch] = refa;
+-+          }
+-+          else {
+-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+++            *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm);
+++            *p++ = code;
+ +        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+static void rpi_simulate_inter_qpu(HEVCContext *s)
+-+{
+-+  // First run the transform as normal
+-+  int i;
+-+  rpi_execute_transform(s);
+-+  for(i=0;i<8;i++)
+-+  {
+-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+-+  }
+-+  for(i=0;i<12;i++)
+-+  {
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+-+  }
+-+}
+-+
+-+#endif
+ +
+-+#ifdef RPI_INTER_QPU
+++        vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
+ +
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    int job = s->pass1_job;
+-+    int i;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+-+#ifdef RPI_LUMA_QPU
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+++#if RPI_CACHE_UNIF_MVS
+++        rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +#endif
+-+    if (s->sh.slice_type == I_SLICE) {
+-+#ifdef RPI_MULTI_MAILBOX
+-+      rpi_execute_transform(s);
+-+      return;
+-+#endif
+-+    }
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++          flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
+ +    }
+-+
+-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+#ifdef RPI_LUMA_QPU
+-+    for(k=0;k<12;k++) {
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+-+    }
+-+    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+#endif
+-+
+-+#ifdef RPI_SIMULATE_QPUS
+-+    rpi_simulate_inter_qpu(s);
+-+    return;
+ +#endif
+ +
+-+#ifdef RPI_MULTI_MAILBOX
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+-+#else
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+-+#endif
+++// We can take a sync here and try to locally overlap QPU processing with ARM
+++// but testing showed a slightly negative benefit with noticeable extra complexity
+++//    vpu_qpu_job_add_sync_this(vqj, &sync_c);
+ +
+-+#if 1
+++    if (qpu_luma && mc_terminate_y(s, job) != 0)
+ +    {
+-+        unsigned int i;
+++        HEVCRpiJob * const jb = s->jobs + job;
+++        const uint32_t code = qpu_fn(mc_setup);
+ +        uint32_t * p;
+-+        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+-+        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+++        unsigned int i;
+ +        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+ +
+-+        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+-+            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
+-+            *p++ = code;
+-+        }
+-+
+-+        code = qpu_get_fn(QPU_MC_SETUP);
+ +        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+-+            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
+++            *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm);
+ +            *p++ = code;
+ +        }
+ +
+-+        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+-+            vpu_get_constants(),
+-+            s->coeffs_buf_vc[job][2],
+-+            s->num_coeffs[job][2] >> 8,
+-+            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+-+            s->num_coeffs[job][3] >> 10,
+-+            0,
+-+            // QPU job 1
+-+            QPU_N_UV,
+-+            mail_uv,
+-+            // QPU job 2
+-+            QPU_N_Y,
+-+            mail_y
+-+            );
+-+    }
+++        vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
+ +
+-+#else
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+#ifdef RPI_LUMA_QPU
+-+                                   qpu_get_fn(QPU_MC_SETUP),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+-+#else
+-+                                   0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0
+++#if RPI_CACHE_UNIF_MVS
+++        rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +#endif
+-+                                 );
+-+#endif
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[job][i] = 0;
+-+#else
+-+#error Code rotted here
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+-+      );
+-+#endif
+-+
+++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++          flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
+++    }
+ +
+-+}
+-+#else
+++    pthread_mutex_unlock(&wg->lock);
+ +
+-+#ifdef RPI
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+  rpi_execute_transform(s);
+-+}
+ +#endif
+ +
+-+#endif
+++    vpu_qpu_job_add_sync_this(vqj, &sync_y);
+ +
+-+#ifdef RPI
+++    // Having accumulated some commands - do them
+++    rpi_cache_flush_finish(rfe);
+++    vpu_qpu_job_finish(vqj);
+ +
+-+#ifndef RPI_FAST_CACHEFLUSH
+-+#error RPI_FAST_CACHEFLUSH is broken
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+++    memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job]));  //???? Surely we haven't done the smaller
+++
+++#if Y_B_ONLY
+++    if (qpu_luma)
+++        vpu_qpu_wait(&sync_y);
+ +#endif
+++    // Perform inter prediction
+++    rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0);
+ +
+-+static void flush_frame(HEVCContext *s,AVFrame *frame)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-+    iocache.s[0].size  = sz;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-+    iocache.s[1].size  = sz;
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-+    iocache.s[2].size  = sz;
+-+    vcsm_clean_invalid( &iocache );
+++    // Wait for transform completion
+++
+++    // Perform intra prediction and residual reconstruction
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost);
+++#if Y_B_ONLY
+++    if (!qpu_luma)
+++        vpu_qpu_wait(&sync_y);
+ +#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+++    vpu_qpu_wait(&sync_y);
+ +#endif
+++    rpi_execute_pred_cmds(s);
+++
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+++
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost);
+ +}
+ +
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+++static void rpi_do_all_passes(HEVCContext *s)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n;
+-+    int curr_y;
+-+    int curr_uv;
+-+    int n_uv;
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-+    int sz,base;
+-+    int (*d)[2] = s->dblk_cmds[job];
+-+    int low=(*d)[1];
+-+    int high=(*d)[1];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+-+        int y = (*d)[1];
+-+        low=FFMIN(low,y);
+-+        high=FFMAX(high,y);
+-+    }
+-+    curr_y = low;
+-+    n = high+(1 << s->ps.sps->log2_ctb_size);
+-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+    n_uv = n >> s->ps.sps->vshift[1];
+-+
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-+    iocache.s[0].size  = sz;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-+    iocache.s[1].size  = sz;
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-+    iocache.s[2].size  = sz;
+-+
+-+    iocache.s[3].handle = p0->vcsm_handle;
+-+    iocache.s[3].cmd = 3; // clean+invalidate
+-+    iocache.s[3].addr = (int) p0->arm;
+-+    iocache.s[3].size  = p0->numbytes;
+-+    if (p1) {
+-+      iocache.s[4].handle = p1->vcsm_handle;
+-+      iocache.s[4].cmd = 3; // clean+invalidate
+-+      iocache.s[4].addr = (int) p1->arm;
+-+      iocache.s[4].size  = p1->numbytes;
+-+    }
+-+    if (p2) {
+-+      iocache.s[5].handle = p2->vcsm_handle;
+-+      iocache.s[5].cmd = 3; // clean+invalidate
+-+      iocache.s[5].addr = (int) p2->arm;
+-+      iocache.s[5].size  = p2->numbytes;
+-+    }
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+    gpu_cache_flush3(p0, p1, p2);
+-+#endif
+++    // Do the various passes - common with the worker code
+++    worker_core(s);
+++    // Prepare next batch
+++    rpi_begin(s);
+ +}
+ +
+++
+++
+ +#endif
+ +
+  static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+  {
+      HEVCContext *s  = avctxt->priv_data;
+-@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+      int y_ctb       = 0;
+      int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+  
+ +#ifdef RPI
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+++    s->enable_rpi = s->ps.sps->bit_depth == 8 &&
+++        s->frame->format == AV_PIX_FMT_SAND128 &&
+++        !s->ps.pps->cross_component_prediction_enabled_flag;
+ +
+ +    if (!s->enable_rpi) {
+ +      if (s->ps.pps->cross_component_prediction_enabled_flag)
+@@ -4151,7 +5460,7 @@ index b478065..88dd40b 100644
+      if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+          av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+          return AVERROR_INVALIDDATA;
+-@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+          }
+      }
+  
+@@ -4166,26 +5475,25 @@ index b478065..88dd40b 100644
+      while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+          int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+  
+-@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
++         hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
++ 
+++
++         ff_hevc_cabac_init(s, ctb_addr_ts);
++ 
++         hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
++@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+          s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+          s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+  
+-+#ifdef RPI_INTER_QPU
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+++#if RPI_INTER
+++        s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV;
+++        s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y;
+ +#endif
+ +
+          more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+ +
+-+#ifdef RPI_INTER_QPU
+-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+-+#endif
+-+
+ +#ifdef RPI
+ +        if (s->enable_rpi) {
+ +          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+@@ -4195,14 +5503,18 @@ index b478065..88dd40b 100644
+ +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+ +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+ +          s->ctu_count++;
+-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+ +
+ +          if ( s->ctu_count >= s->max_ctu_count ) {
+ +#ifdef RPI_WORKER
+-+            if (s->used_for_ref) {
+++            if (s->used_for_ref)
+++            {
+++//              printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
+++
+++//                worker_wait(s);
+ +              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ +              // Pass on this job to worker thread
+ +              worker_submit_job(s);
+++
+ +              // Make sure we have space to prepare the next job
+ +              worker_pass0_ready(s);
+ +
+@@ -4224,7 +5536,7 @@ index b478065..88dd40b 100644
+          if (more_data < 0) {
+              s->tab_slice_address[ctb_addr_rs] = -1;
+              return more_data;
+-@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+  
+          ctb_addr_ts++;
+          ff_hevc_save_states(s, ctb_addr_ts);
+@@ -4249,12 +5561,25 @@ index b478065..88dd40b 100644
+ +        rpi_do_all_passes(s);
+ +    }
+ +
+++#if RPI_TSTATS
+++    {
+++        HEVCRpiStats *const ts = &s->tstats;
+++
+++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
+++        memset(ts, 0, sizeof(*ts));
+++    }
+++#endif
+++
+ +#endif
+ +
+      if (x_ctb + ctb_size >= s->ps.sps->width &&
+          y_ctb + ctb_size >= s->ps.sps->height)
+          ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+-@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
++@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+      s = s1->sList[self_id];
+      lc = s->HEVClc;
+  
+@@ -4266,16 +5591,32 @@ index b478065..88dd40b 100644
+      if(ctb_row) {
+          ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+  
+-@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
++@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+          if (ret < 0)
+              return ret;
+  
+-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+++        // The definition of _N unit types is "non-reference for other frames
+++        // with the same temporal_id" so they may/will be ref frames for pics
+++        // with a higher temporal_id.
+++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
+++            !(s->nal_unit_type == NAL_TRAIL_N ||
+ +                        s->nal_unit_type == NAL_TSA_N   ||
+ +                        s->nal_unit_type == NAL_STSA_N  ||
+ +                        s->nal_unit_type == NAL_RADL_N  ||
+ +                        s->nal_unit_type == NAL_RASL_N);
+ +
+++#if DEBUG_DECODE_N
+++        {
+++            static int z = 0;
+++            if (IS_IDR(s)) {
+++                z = 1;
+++            }
+++            if (z != 0 && z++ > DEBUG_DECODE_N) {
+++                s->is_decoded = 0;
+++                break;
+++            }
+++        }
+++#endif
+ +        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ +            s->is_decoded = 0;
+ +            break;
+@@ -4283,27 +5624,30 @@ index b478065..88dd40b 100644
+          if (s->max_ra == INT_MAX) {
+              if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+                  s->max_ra = s->poc;
+-@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++         }
+      }
+  
+- fail:
++-fail:
+ -    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+++fail:  // Also success path
+ +    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+-+#ifdef RPI_INTER_QPU
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+++#if RPI_INTER
+++        rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
+ +#endif
+          ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+ -
+-+    } else if (s->ref) {
+-+#ifdef RPI_INTER_QPU
+++    }
+++#if RPI_INTER
+++    else if (s->ref && s->enable_rpi) {
+ +      // When running single threaded we need to flush the whole frame
+ +      flush_frame(s,s->frame);
+-+#endif
+ +    }
+++#endif
+      return ret;
+  }
+  
+-@@ -3064,6 +4625,41 @@ fail:
++@@ -3064,6 +4764,41 @@ fail:
+      return AVERROR(ENOMEM);
+  }
+  
+@@ -4345,7 +5689,7 @@ index b478065..88dd40b 100644
+  static av_cold int hevc_decode_free(AVCodecContext *avctx)
+  {
+      HEVCContext       *s = avctx->priv_data;
+-@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+  
+      av_freep(&s->cabac_state);
+  
+@@ -4356,29 +5700,26 @@ index b478065..88dd40b 100644
+ +#endif
+ +
+ +    for(i=0;i<RPI_MAX_JOBS;i++) {
+-+      av_freep(&s->unif_mv_cmds[i]);
+-+      av_freep(&s->univ_pred_cmds[i]);
+ +
+-+#ifdef RPI_INTER_QPU
+-+      if (s->unif_mvs[i]) {
+-+        gpu_free( &s->unif_mvs_ptr[i] );
+-+        s->unif_mvs[i] = 0;
+-+      }
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+      if (s->y_unif_mvs[i]) {
+-+        gpu_free( &s->y_unif_mvs_ptr[i] );
+-+        s->y_unif_mvs[i] = 0;
+-+      }
+++        av_freep(&s->unif_mv_cmds_y[i]);
+++        av_freep(&s->unif_mv_cmds_c[i]);
+++        av_freep(&s->univ_pred_cmds[i]);
+++
+++#if RPI_INTER
+++        gpu_free(&s->jobs[i].chroma_mvs_gptr);
+++        gpu_free(&s->jobs[i].luma_mvs_gptr);
+ +#endif
+ +    }
+ +
+++    vpu_qpu_term();
+++
+++    av_rpi_zc_uninit(avctx);
+ +#endif
+ +
+      for (i = 0; i < 3; i++) {
+          av_freep(&s->sao_pixel_buffer_h[i]);
+          av_freep(&s->sao_pixel_buffer_v[i]);
+-@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+      return 0;
+  }
+  
+@@ -4398,75 +5739,76 @@ index b478065..88dd40b 100644
+  {
+      HEVCContext *s = avctx->priv_data;
+      int i;
+-+    int job;
+++#ifdef RPI
+++    unsigned int job;
+++#endif
+  
+      s->avctx = avctx;
+  
+-@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
++@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+      s->HEVClcList[0] = s->HEVClc;
+      s->sList[0] = s;
+  
+ +#ifdef RPI
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+        if (!s->unif_mv_cmds[job])
+++    // Whilst FFmpeg's init fn is only called once, the close fn is called as
+++    // many times as we have threads (init_thread_copy is called for the
+++    // threads).  So to match init & term put the init here where it will be
+++    // called by both init & copy
+++    av_rpi_zc_init(avctx);
+++
+++    if (vpu_qpu_init() != 0)
+++        goto fail;
+++
+++    for(job = 0; job < RPI_MAX_JOBS; job++) {
+++        s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y);
+++        if (!s->unif_mv_cmds_y[job])
+++            goto fail;
+++        s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C);
+++        if (!s->unif_mv_cmds_c[job])
+ +            goto fail;
+ +        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+ +        if (!s->univ_pred_cmds[job])
+ +            goto fail;
+ +    }
+ +
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +    // We divide the image into blocks 256 wide and 64 high
+ +    // We support up to 2048 widths
+ +    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+ +    // Also add space for the startup command for each stream.
+ +
+-+    {
+-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+		for(job=0;job<RPI_MAX_JOBS;job++) {
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++    for (job = 0; job < RPI_MAX_JOBS; job++) {
+++        HEVCRpiJob * const jb = s->jobs + job;
+++#if RPI_CACHE_UNIF_MVS
+++        gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
+++        gpu_malloc_cached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
+ +#else
+-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++        gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
+++        gpu_malloc_uncached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
+ +#endif
+-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+ +
+-+          // Set up initial locations for uniform streams
+-+          p = s->unif_mvs[job];
+-+          for(i = 0; i < 8; i++) {
+-+            s->mvs_base[job][i] = p;
+-+            p += uv_commands_per_qpu;
+-+          }
+++        {
+++            qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm;
+++            for(i = 0; i < QPU_N_UV; i++) {
+++                jb->chroma_mvs[i].qpu_mc_base = p;
+++                jb->chroma_mvs[i].qpu_mc_curr = p;
+++                p += UV_COMMANDS_PER_QPU;
+++            }
+ +        }
+-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+-+    }
+-+
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+    for(job=0;job<RPI_MAX_JOBS;job++)
+-+    {
+-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+-+#else
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+-+#endif
+-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->y_unif_mvs[job];
+-+        for(i = 0; i < 12; i++) {
+-+            s->y_mvs_base[job][i] = p;
+-+            p += y_commands_per_qpu;
+++        {
+++            qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm;
+++            for(i = 0; i < QPU_N_Y; i++) {
+++                jb->luma_mvs[i].qpu_mc_base = p;
+++                jb->luma_mvs[i].qpu_mc_curr = p;
+++                p += Y_COMMANDS_PER_QPU;
+++            }
+ +        }
+ +    }
+-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+++    s->qpu_filter_uv = qpu_fn(mc_filter_uv);
+++    s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0);
+++    s->qpu_dummy_frame = qpu_fn(mc_setup_c);  // Use our code as a dummy frame
+++    s->qpu_filter = qpu_fn(mc_filter);
+++    s->qpu_filter_b = qpu_fn(mc_filter_b);
+ +#endif
+ +    //gpu_malloc_uncached(2048*64,&s->dummy);
+ +
+@@ -4481,8 +5823,30 @@ index b478065..88dd40b 100644
+      s->cabac_state = av_malloc(HEVC_CONTEXTS);
+      if (!s->cabac_state)
+          goto fail;
++@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
++     }
++ 
++     if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++-            s->threads_type = FF_THREAD_FRAME;
++-        else
++-            s->threads_type = FF_THREAD_SLICE;
+++        s->threads_type = FF_THREAD_FRAME;
+++    else
+++        s->threads_type = FF_THREAD_SLICE;
++ 
++     return 0;
++ }
++@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = {
++     .update_thread_context = hevc_update_thread_context,
++     .init_thread_copy      = hevc_init_thread_copy,
++     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+++//                             0,
+++//                             AV_CODEC_CAP_FRAME_THREADS,
++                              AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
++     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++ };
+ diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index be91010..6b03ea8 100644
++index be91010..dd7d152 100644
+ --- a/libavcodec/hevc.h
+ +++ b/libavcodec/hevc.h
+ @@ -23,6 +23,9 @@
+@@ -4495,37 +5859,53 @@ index be91010..6b03ea8 100644
+  #include "libavutil/buffer.h"
+  #include "libavutil/md5.h"
+  
+-@@ -37,6 +40,29 @@
++@@ -37,6 +40,45 @@
+  #include "thread.h"
+  #include "videodsp.h"
+  
+ +// define RPI to split the CABAC/prediction/transform into separate stages
+-+#ifdef RPI
+++#ifndef RPI
+ +
+-+  #include "rpi_qpu.h"
+-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+-+  #define RPI_INTER_QPU
+++  #define RPI_INTER          0
+++  #define RPI_TSTATS         0
+++  #define RPI_HEVC_SAND      0
+ +
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-+    #define RPI_LUMA_QPU
+-+  #endif
+++#else
+++
+++  #include "rpi_qpu.h"
+++  #define RPI_INTER          1          // 0 use ARM for UV inter-pred, 1 use QPU
+ +
+-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+-+  #define RPI_MAX_JOBS 2
+ +  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+ +  #define RPI_WORKER
+++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+++  // This has no effect unless RPI_WORKER is defined
+++  // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
+++  // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
+++  // free for the foreground to fill in.
+++  #define RPI_MAX_JOBS 2
+++
+ +  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+++  // As it stands there is something mildly broken in VPU deblock - looks mostly OK
+++  // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
+++  // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
+ +//  #define RPI_DEBLOCK_VPU
+ +
+-+#endif
+++  #define RPI_VPU_DEBLOCK_CACHED 1
+++
+++  #if HAVE_NEON
+++  #define RPI_HEVC_SAND      1
+++  #else
+++  // Sand is broken on Pi1 currently - reasons unknown
+++  #define RPI_HEVC_SAND      0
+++  #endif
+ +
+-+#define RPI_VPU_DEBLOCK_CACHED 1
+++  #define RPI_TSTATS 0
+++#endif
+ +
+  #define MAX_DPB_SIZE 16 // A.4.1
+  #define MAX_REFS 16
+  
+-@@ -660,17 +686,6 @@ typedef struct CodingUnit {
++@@ -660,17 +702,6 @@ typedef struct CodingUnit {
+      uint8_t cu_transquant_bypass_flag;
+  } CodingUnit;
+  
+@@ -4543,7 +5923,7 @@ index be91010..6b03ea8 100644
+  typedef struct NeighbourAvailable {
+      int cand_bottom_left;
+      int cand_left;
+-@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
++@@ -747,7 +778,17 @@ typedef struct HEVCFrame {
+      uint8_t flags;
+  } HEVCFrame;
+  
+@@ -4561,7 +5941,7 @@ index be91010..6b03ea8 100644
+      uint8_t cabac_state[HEVC_CONTEXTS];
+  
+      uint8_t stat_coeff[4];
+-@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
++@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext {
+  
+      int qPy_pred;
+  
+@@ -4569,7 +5949,7 @@ index be91010..6b03ea8 100644
+  
+      uint8_t ctb_left_flag;
+      uint8_t ctb_up_flag;
+-@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
++@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext {
+      int ct_depth;
+      CodingUnit cu;
+      PredictionUnit pu;
+@@ -4577,7 +5957,7 @@ index be91010..6b03ea8 100644
+  
+  #define BOUNDARY_LEFT_SLICE     (1 << 0)
+  #define BOUNDARY_LEFT_TILE      (1 << 1)
+-@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
++@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext {
+      int boundary_flags;
+  } HEVCLocalContext;
+  
+@@ -4589,13 +5969,15 @@ index be91010..6b03ea8 100644
+ +// This is a distance of 1536 pixels across the screen
+ +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+ +// but allocate more memory and increase the latency before data in the next frame can be processed
+-+#define RPI_NUM_CHUNKS 1
+++#define RPI_NUM_CHUNKS 4
+++#define RPI_CHUNK_SIZE 12
+ +
+ +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
+ +
+ +// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+++#define RPI_MAX_MV_CMDS_Y   (2*16*1*(RPI_MAX_WIDTH/4))
+++#define RPI_MAX_MV_CMDS_C   (2*16*2*(RPI_MAX_WIDTH/4))
+ +// Each block can have an intra prediction and a transform_add command
+ +#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+ +// Worst case is 16x16 CTUs
+@@ -4612,53 +5994,118 @@ index be91010..6b03ea8 100644
+ +
+ +// Command for inter prediction
+ +typedef struct HEVCMvCmd {
+-+    int cmd;
+-+    uint8_t *dst;
+-+    ptrdiff_t dststride;
+++    uint8_t cmd;
+++    uint8_t block_w;
+++    uint8_t block_h;
+++    int8_t ref_idx[2];
+++    uint16_t dststride;
+++    uint16_t srcstride;
+++    uint16_t srcstride1;
+++    int16_t weight;
+++    int16_t offset;
+++    int16_t x_off;
+++    int16_t y_off;
+ +    uint8_t *src;
+-+    ptrdiff_t srcstride;
+-+    Mv mv;
+-+    int x_off;
+-+    int y_off;
+-+    int block_w;
+-+    int block_h;
+-+    int weight;
+-+    int offset;
+ +    uint8_t *src1;
+-+    ptrdiff_t srcstride1;
+++    uint8_t *dst;
+++    Mv mv;
+ +    Mv mv1;
+-+    int8_t ref_idx[2];
+ +} HEVCMvCmd;
+ +
+ +
+ +// Command for intra prediction and transform_add of predictions to coefficients
+-+#define RPI_PRED_TRANSFORM_ADD 0
+-+#define RPI_PRED_INTRA 1
+++enum rpi_pred_cmd_e // Values stored in HEVCPredCmd.type
+++{
+++    RPI_PRED_ADD_RESIDUAL,   // Base add-residual command
+++    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx (c_idx == 1)
+++    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx (remaining chroma plane)
+++    RPI_PRED_INTRA,          // Uses the i_pred member of HEVCPredCmd
+++    RPI_PRED_I_PCM,          // Uses the i_pcm member of HEVCPredCmd
+++    RPI_PRED_CMD_MAX         // One past the last command value
+++};
+++
+ +typedef struct HEVCPredCmd {
+-+    uint8_t size;
+ +    uint8_t type;
+-+    uint8_t na;
+-+    uint8_t c_idx;
+-+    union {
+-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t x;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t y;   // RPI_PRED_INTRA
+-+    };
+++    uint8_t size;  // log2 "size" used by all variants
+++    uint8_t na;    // i_pred - but left here as they pack well
+++    uint8_t c_idx; // i_pred
+ +    union {
+-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t stride;         // RPI_PRED_INTRA
+++        struct {  // TRANSFORM_ADD
+++            uint8_t * dst;
+++            const int16_t * buf;
+++            uint32_t stride;
+++        } ta;
+++        struct {  // INTRA
+++            uint16_t x;
+++            uint16_t y;
+++            enum IntraPredMode mode;
+++        } i_pred;
+++        struct {  // I_PCM
+++            uint16_t x;
+++            uint16_t y;
+++            const void * src;
+++            uint32_t src_len;
+++        } i_pcm;
+ +    };
+ +} HEVCPredCmd;
+ +
+ +#endif
+ +
+++#ifdef RPI
+++
+++struct qpu_mc_pred_c_s;
+++struct qpu_mc_pred_y_s;
+++
+++typedef struct HEVCRpiLumaPred // Luma command-stream state; HEVCRpiJob holds QPU_N_Y of these
+++{
+++    struct qpu_mc_pred_y_s *qpu_mc_base; // Start of this entry's command area
+++    struct qpu_mc_pred_y_s *qpu_mc_curr; // Initialised equal to qpu_mc_base; presumably the next free command - TODO confirm
+++    struct qpu_mc_pred_y_s *last_lx;
+++    unsigned int load;
+++} HEVCRpiLumaPred;
+++
+++typedef struct HEVCRpiChromaPred // Chroma command-stream state; HEVCRpiJob holds QPU_N_UV of these
+++{
+++    struct qpu_mc_pred_c_s *qpu_mc_base; // NOTE(review): presumably mirrors HEVCRpiLumaPred.qpu_mc_base - confirm
+++    struct qpu_mc_pred_c_s *qpu_mc_curr; // NOTE(review): presumably mirrors HEVCRpiLumaPred.qpu_mc_curr - confirm
+++    struct qpu_mc_pred_c_s *last_l0;
+++    struct qpu_mc_pred_c_s *last_l1;
+++    unsigned int load;
+++} HEVCRpiChromaPred;
+++
+++typedef struct HEVCRpiJob { // Per-job prediction command state; HEVCContext holds RPI_MAX_JOBS of these
+++    GPU_MEM_PTR_T chroma_mvs_gptr; // NOTE(review): presumably the chroma equivalent of luma_mvs_gptr - confirm
+++    GPU_MEM_PTR_T luma_mvs_gptr; // Memory whose .arm pointer is divided between the luma_mvs[] entries
+++    HEVCRpiChromaPred chroma_mvs[QPU_N_UV];
+++    HEVCRpiLumaPred luma_mvs[QPU_N_Y];
+++} HEVCRpiJob;
+++
+++#if RPI_TSTATS
+++typedef struct HEVCRpiStats { // Only compiled when RPI_TSTATS is non-zero; presumably per-case luma pred counters - TODO confirm
+++    int y_pred1_y8_merge;
+++    int y_pred1_xy;
+++    int y_pred1_x0;
+++    int y_pred1_y0;
+++    int y_pred1_x0y0;
+++    int y_pred1_wle8;
+++    int y_pred1_wgt8;
+++    int y_pred1_hle16;
+++    int y_pred1_hgt16;
+++    int y_pred2_xy;
+++    int y_pred2_x0;
+++    int y_pred2_y0;
+++    int y_pred2_x0y0;
+++    int y_pred2_hle16;
+++    int y_pred2_hgt16;
+++} HEVCRpiStats;
+++#endif
+++
+++#endif
+++
+  typedef struct HEVCContext {
+      const AVClass *c;  // needed by private avoptions
+      AVCodecContext *avctx;
+-@@ -798,13 +895,107 @@ typedef struct HEVCContext {
++@@ -798,13 +978,103 @@ typedef struct HEVCContext {
+  
+      HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+      HEVCLocalContext    *HEVClc;
+@@ -4676,7 +6123,8 @@ index be91010..6b03ea8 100644
+ +
+ +#ifdef RPI
+ +    int enable_rpi;
+-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+++    HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS];
+++    HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS];
+ +    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+ +    int buf_width;
+ +    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+@@ -4685,7 +6133,8 @@ index be91010..6b03ea8 100644
+ +    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+ +    int num_coeffs[RPI_MAX_JOBS][4];
+ +    int num_xfm_cmds[RPI_MAX_JOBS];
+-+    int num_mv_cmds[RPI_MAX_JOBS];
+++    int num_mv_cmds_y[RPI_MAX_JOBS];
+++    int num_mv_cmds_c[RPI_MAX_JOBS];
+ +    int num_pred_cmds[RPI_MAX_JOBS];
+ +    int num_dblk_cmds[RPI_MAX_JOBS];
+ +    int vpu_id;
+@@ -4695,29 +6144,23 @@ index be91010..6b03ea8 100644
+ +    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+ +    int ctu_per_y_chan; // Number of CTUs per luma QPU
+ +    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+-+#ifdef RPI_INTER_QPU
+-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+-+    // Function pointers
+-+    uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b0;
+-+    uint32_t mc_filter_uv_b;
+++
+++    HEVCRpiJob jobs[RPI_MAX_JOBS];
+++#if RPI_TSTATS
+++    HEVCRpiStats tstats;
+ +#endif
+-+#ifdef RPI_LUMA_QPU
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
+++#if RPI_INTER
+++    HEVCRpiChromaPred * curr_pred_c;
+++    HEVCRpiLumaPred * curr_pred_y;
+++    struct qpu_mc_pred_y_s * last_y8_p;
+++    struct qpu_mc_pred_y_s * last_y8_lx;
+++
+ +    // Function pointers
+-+    uint32_t mc_filter;
+-+    uint32_t mc_filter_b;
+++    uint32_t qpu_filter_uv;
+++    uint32_t qpu_filter_uv_b0;
+++    uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory
+++    uint32_t qpu_filter;
+++    uint32_t qpu_filter_b;
+ +#endif
+ +
+ +#ifdef RPI_WORKER
+@@ -4754,7 +6197,7 @@ index be91010..6b03ea8 100644
+ +        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ +        int vpu_cmds_vc;
+ +
+-+        int cmd_id;
+++        vpu_qpu_wait_h cmd_id;
+ +    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+ +
+ +    struct dblk_vpu_q_s * dvq;
+@@ -4767,7 +6210,7 @@ index be91010..6b03ea8 100644
+      uint8_t *cabac_state;
+  
+      /** 1 if the independent slice segment header was successfully parsed */
+-@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
++@@ -922,6 +1192,9 @@ typedef struct HEVCContext {
+      uint32_t max_mastering_luminance;
+      uint32_t min_mastering_luminance;
+  
+@@ -4777,22 +6220,38 @@ index be91010..6b03ea8 100644
+  } HEVCContext;
+  
+  int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+-@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                   int log2_trafo_size, enum ScanType scan_idx,
+                                   int c_idx);
+  
+-+#ifdef RPI_INTER_QPU
+-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+++#if RPI_INTER
+++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
+ +#endif
+ +
+  void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+  
+  
++@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16];
++ extern const uint8_t ff_hevc_diag_scan8x8_x[64];
++ extern const uint8_t ff_hevc_diag_scan8x8_y[64];
++ 
+++#ifdef RPI
+++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
+++
+++// arm/hevc_misc_neon.S
+++// Neon coeff zap fn
+++#if HAVE_NEON
+++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+++#endif
+++
+++#endif
+++
++ #endif /* AVCODEC_HEVC_H */
+ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 05b2821..e2f1f4e 100644
++index 05b2821..733efde 100644
+ --- a/libavcodec/hevc_cabac.c
+ +++ b/libavcodec/hevc_cabac.c
+-@@ -21,14 +21,72 @@
++@@ -21,14 +21,76 @@
+   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+   */
+  
+@@ -4805,6 +6264,10 @@ index 05b2821..e2f1f4e 100644
+  #include "hevc.h"
+ +#include "cabac_functions.h"
+ +
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
+++
+ +// BY22 is probably faster than simple bypass if the processor has
+ +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+ +// x86 has fast int divide
+@@ -4866,7 +6329,7 @@ index 05b2821..e2f1f4e 100644
+  /**
+   * number of bin by SyntaxElement.
+   */
+-@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
++@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
+      { 28, 36, 43, 49, 54, 58, 61, 63, },
+  };
+  
+@@ -5078,7 +6541,7 @@ index 05b2821..e2f1f4e 100644
+  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
+  {
+      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
+-@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
++@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
+      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
+  }
+  
+@@ -5104,7 +6567,7 @@ index 05b2821..e2f1f4e 100644
+  }
+  
+  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+-@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
++@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
+  }
+  
+@@ -5121,7 +6584,7 @@ index 05b2821..e2f1f4e 100644
+          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+          ctx_shift = (log2_size + 1) >> 2;
+      } else {
+-@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
++@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
+      return value;
+  }
+  
+@@ -5147,7 +6610,7 @@ index 05b2821..e2f1f4e 100644
+  {
+      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+  }
+-@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
++@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
+      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
+  }
+  
+@@ -5160,7 +6623,7 @@ index 05b2821..e2f1f4e 100644
+ +
+ +#ifndef coeff_abs_level_remaining_decode_bypass
+ +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
+- {
+++{
+ +    CABACContext * const c = &s->HEVClc->cc;
+ +    uint32_t y;
+ +    unsigned int prefix;
+@@ -5201,7 +6664,7 @@ index 05b2821..e2f1f4e 100644
+ +#endif
+ +
+ +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+-+{
++ {
+ +    CABACContext * const c = &s->HEVClc->cc;
+      int prefix = 0;
+      int suffix = 0;
+@@ -5347,7 +6810,7 @@ index 05b2821..e2f1f4e 100644
+ +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+ +{
+ +    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+- }
+++}
+ +#endif
+ +
+ +
+@@ -5442,6 +6905,45 @@ index 05b2821..e2f1f4e 100644
+ +    return i;
+ +}
+ +
+++#ifdef RPI
+++static void rpi_add_residual(HEVCContext * const s, // Add a residual block to the frame, or record that add as a pred command
+++    const unsigned int log2_trafo_size, const unsigned int c_idx, // log2 block size; plane index
+++    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) // position before hshift/vshift scaling; residual data
+++{
+++    const AVFrame * const frame = s->frame;
+++    unsigned int stride = frame->linesize[c_idx];
+++    unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; // scale position to this plane
+++    unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+++    const int is_sliced = rpi_sliced_frame(frame);
+++    uint8_t * dst = !is_sliced ? // plain frames: address from data/stride; sliced frames: ask the rpi_sliced_frame_pos_* helpers
+++            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+++        c_idx == 0 ?
+++            rpi_sliced_frame_pos_y(frame, x, y) :
+++            rpi_sliced_frame_pos_c(frame, x, y);
+++
+++//    if (c_idx != 0) {
+++//        return;
+++//    }
+++    if (s->enable_rpi) { // record a command in the pass0 job's list instead of adding here
+++        HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; // NOTE(review): no bounds check visible here - confirm capacity is enforced elsewhere
+++        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); // sliced frames select the _U/_V variants via c_idx
+++        cmd->size = log2_trafo_size;
+++        cmd->c_idx = c_idx;
+++        cmd->ta.buf = coeffs;
+++        cmd->ta.dst = dst;
+++        cmd->ta.stride = stride;
+++    }
+++    else if (!is_sliced || c_idx == 0) { // immediate add: plain frame, or luma of a sliced frame
+++        s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); // cast drops const; presumably to match the dsp signature - TODO confirm
+++    }
+++    else if (c_idx == 1) { // sliced chroma, c_idx 1
+++        s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++    }
+++    else { // sliced chroma, remaining plane
+++        s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++    }
++ }
+++#endif
+  
+  void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                  int log2_trafo_size, enum ScanType scan_idx,
+@@ -5471,17 +6973,20 @@ index 05b2821..e2f1f4e 100644
+ +    const uint8_t *scan_x_cg, *scan_y_cg;
+ +    const xy_off_t * scan_xy_off;
+  
+++#ifndef RPI
+      ptrdiff_t stride = s->frame->linesize[c_idx];
+      int hshift = s->ps.sps->hshift[c_idx];
+      int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
++-    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+++    uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                            ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++-    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++#endif
+ +#ifdef RPI
+-+    //***** transform_skip_flag decoded later!
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+++    int use_vpu;
+ +#endif
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+--    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++    int16_t *coeffs;
+ +    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
+      int explicit_rdpcm_flag = 0;
+      int explicit_rdpcm_dir_flag;
+@@ -5496,39 +7001,12 @@ index 05b2821..e2f1f4e 100644
+      int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                           lc->tu.intra_pred_mode_c;
+  
++-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ +    int prev_sig = 0;
+ +    const int c_idx_nz = (c_idx != 0);
+ +
+ +    int may_hide_sign;
+ +
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int n = trafo_size * trafo_size;
+-+        if (use_vpu) {
+-+            // We support size 4 and size 5.
+-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+-+            // num_coeffs is indexed by log2_trafo_size-2
+-+            if (log2_trafo_size == 4)
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            else
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-+        } else {
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-+            s->num_coeffs[s->pass0_job][0] += n;
+-+        }
+-+    }
+-+    // We now do the memset after transform_add while we know the data is cached.
+-+    #ifdef RPI_PRECLEAR
+-+    #else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #endif
+-+#else
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+#endif
+-+
+-+
+  
+      // Derive QP for dequant
+      if (!lc->cu.cu_transquant_bypass_flag) {
+@@ -5537,7 +7015,7 @@ index 05b2821..e2f1f4e 100644
+          static const uint8_t rem6[51 + 4 * 6 + 1] = {
+              0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+              3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+-@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+          };
+          int qp_y = lc->qp_y;
+  
+@@ -5558,7 +7036,7 @@ index 05b2821..e2f1f4e 100644
+          }
+  
+          if (c_idx == 0) {
+-@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              qp += s->ps.sps->qp_bd_offset;
+          }
+  
+@@ -5629,6 +7107,9 @@ index 05b2821..e2f1f4e 100644
+ +        may_hide_sign = 0;
+      }
+  
+++
+++
+++
+      if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+ -        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+ -        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+@@ -5646,7 +7127,7 @@ index 05b2821..e2f1f4e 100644
+                                             &last_significant_coeff_x, &last_significant_coeff_y);
+  
+      if (last_significant_coeff_x > 3) {
+-@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+          int last_x_c = last_significant_coeff_x & 3;
+          int last_y_c = last_significant_coeff_y & 3;
+  
+@@ -5703,27 +7184,53 @@ index 05b2821..e2f1f4e 100644
+ -    for (i = num_last_subset; i >= 0; i--) {
+ -        int n, m;
+ -        int x_cg, y_cg, x_c, y_c, pos;
+--        int implicit_non_zero_coeff = 0;
+--        int64_t trans_coeff_level;
+--        int prev_sig = 0;
+--        int offset = i << 4;
+--        int rice_init = 0;
+ +    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
+- 
+--        uint8_t significant_coeff_flag_idx[16];
+--        uint8_t nb_significant_coeff_flag = 0;
+--
+--        x_cg = scan_x_cg[i];
+--        y_cg = scan_y_cg[i];
+--
+--        if ((i < num_last_subset) && (i > 0)) {
+--            int ctx_cg = 0;
+--            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+--                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+--            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+--                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++
+ +    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
+++
+++    {
+++        const unsigned int ccount = 1 << (log2_trafo_size * 2);
+++#ifdef RPI
+++        use_vpu = 0;
+++        if (s->enable_rpi) {
+++            use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4;
+++            coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+++#if HAVE_NEON
+++            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+++#else
+++            memset(coeffs, 0, ccount * sizeof(int16_t));
+++#endif
+++        }
+++        else
+++#endif
+++        {
+++            coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++            memset(coeffs, 0, ccount * sizeof(int16_t));
+++        }
+++    }
+++
+++    i = num_last_subset;
+++    do {
++         int implicit_non_zero_coeff = 0;
++-        int64_t trans_coeff_level;
++-        int prev_sig = 0;
++-        int offset = i << 4;
++-        int rice_init = 0;
+++        int n_end;
+  
++         uint8_t significant_coeff_flag_idx[16];
++-        uint8_t nb_significant_coeff_flag = 0;
++-
++-        x_cg = scan_x_cg[i];
++-        y_cg = scan_y_cg[i];
++-
++-        if ((i < num_last_subset) && (i > 0)) {
++-            int ctx_cg = 0;
++-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
++-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
++-
+ -            significant_coeff_group_flag[x_cg][y_cg] =
+ -                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+ -            implicit_non_zero_coeff = 1;
+@@ -5732,13 +7239,8 @@ index 05b2821..e2f1f4e 100644
+ -            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+ -             (x_cg == 0 && y_cg == 0));
+ -        }
+-+    i = num_last_subset;
+-+    do {
+-+        int implicit_non_zero_coeff = 0;
+-+        int n_end;
+- 
++-
+ -        last_scan_pos = num_coeff - offset - 1;
+-+        uint8_t significant_coeff_flag_idx[16];
+ +        unsigned int nb_significant_coeff_flag = 0;
+  
+          if (i == num_last_subset) {
+@@ -5824,7 +7326,7 @@ index 05b2821..e2f1f4e 100644
+                          if (log2_trafo_size == 3) {
+                              scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                          } else {
+-@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                      }
+                  }
+              }
+@@ -5873,12 +7375,11 @@ index 05b2821..e2f1f4e 100644
+                      significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                      nb_significant_coeff_flag++;
+                  }
+-@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              }
+          }
+  
+ -        n_end = nb_significant_coeff_flag;
+--
+ +        if (nb_significant_coeff_flag != 0) {
+ +            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ +                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+@@ -5926,6 +7427,9 @@ index 05b2821..e2f1f4e 100644
+ +                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ +                }
+  
+++                // Probably not worth the overhead of starting by22 for just one value
+++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
++ 
+ -        if (n_end) {
+ -            int first_nz_pos_in_cg;
+ -            int last_nz_pos_in_cg;
+@@ -5936,9 +7440,6 @@ index 05b2821..e2f1f4e 100644
+ -            int sum_abs = 0;
+ -            int sign_hidden;
+ -            int sb_type;
+-+                // Probably not worth the overhead of starting by22 for just one value
+-+                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+- 
+ +                if (coded_val)
+ +                {
+ +                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+@@ -5949,13 +7450,18 @@ index 05b2821..e2f1f4e 100644
+ +                        const unsigned int c_rice_param = *stat_coeff >> 2;
+ +                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+  
+--            // initialize first elem of coeff_bas_level_greater1_flag
+--            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ +                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ +                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ +                    }
+ +                }
+  
++-            // initialize first elem of coeff_bas_level_greater1_flag
++-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+++                {
+++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++ 
+ -            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ -                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+ -                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+@@ -5963,11 +7469,7 @@ index 05b2821..e2f1f4e 100644
+ -                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+ -                c_rice_param = lc->stat_coeff[sb_type] / 4;
+ -            }
+-+                {
+-+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+-+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+-+                    const unsigned int scale_m = blk_scale[xy_off->scale];
+- 
++-
+ -            if (!(i == num_last_subset) && greater1_ctx == 0)
+ -                ctx_set++;
+ -            greater1_ctx = 1;
+@@ -6052,10 +7554,6 @@ index 05b2821..e2f1f4e 100644
+ +
+ +                            sum_abs += last_coeff_abs_level_remaining + 1;
+ +                            *level = trans_coeff_level;
+-+
+-+                            if (stat_coeff != NULL)
+-+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+-+                            stat_coeff = NULL;
+  
+ -            for (m = 0; m < n_end; m++) {
+ -                n = significant_coeff_flag_idx[m];
+@@ -6076,6 +7574,10 @@ index 05b2821..e2f1f4e 100644
+ -                                if (lc->stat_coeff[sb_type] > 0)
+ -                                    lc->stat_coeff[sb_type]--;
+ -                            rice_init = 1;
+++                            if (stat_coeff != NULL)
+++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                            stat_coeff = NULL;
+++
+ +                            if (trans_coeff_level > (3 << c_rice_param) &&
+ +                                (c_rice_param < 4 || rice_adaptation_enabled))
+ +                                ++c_rice_param;
+@@ -6176,7 +7678,7 @@ index 05b2821..e2f1f4e 100644
+  
+      if (lc->cu.cu_transquant_bypass_flag) {
+          if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+-@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+          }
+      } else {
+@@ -6185,7 +7687,7 @@ index 05b2821..e2f1f4e 100644
+              int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                        log2_trafo_size == 2 &&
+                        lc->cu.pred_mode == MODE_INTRA;
+-@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                  for (i = 0; i < 8; i++)
+                      FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+              }
+@@ -6193,7 +7695,7 @@ index 05b2821..e2f1f4e 100644
+              s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+  
+              if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+-@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                  s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+              }
+          } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+@@ -6221,7 +7723,7 @@ index 05b2821..e2f1f4e 100644
+              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+              if (max_xy == 0)
+                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                      col_limit = FFMIN(24, col_limit);
+                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+              }
+@@ -6229,26 +7731,20 @@ index 05b2821..e2f1f4e 100644
+          }
+      }
+      if (lc->tu.cross_pf) {
+-@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+          }
+      }
+ +#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->buf = coeffs;
+-+        cmd->dst = dst;
+-+        cmd->stride = stride;
+-+        return;
+-+    }
+-+#endif
+++    rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+++#else
+      s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+++#endif
+  }
+  
++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+ diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f33b0c..55a0315 100644
++index 1f33b0c..3143b4f 100644
+ --- a/libavcodec/hevc_filter.c
+ +++ b/libavcodec/hevc_filter.c
+ @@ -22,6 +22,12 @@
+@@ -6269,14 +7765,78 @@ index 1f33b0c..55a0315 100644
+  #include "bit_depth_template.c"
+  
+ +#ifdef RPI
+-+#include "rpi_user_vcsm.h"
+ +#include "rpi_qpu.h"
+++#include "rpi_zc.h"
+ +#endif
+ +
+  #define LUMA 0
+  #define CB 1
+  #define CR 2
+-@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
++     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
++ }
++ 
+++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx)
+++{
+++#ifdef RPI
+++    return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift;
+++#else
+++    return s->ps.sps->pixel_shift;
+++#endif
+++}
+++
++ static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
++                      intptr_t stride_dst, intptr_t stride_src)
++ {
++@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
++                            int stride_src, int x, int y, int width, int height,
++                            int c_idx, int x_ctb, int y_ctb)
++ {
++-    int sh = s->ps.sps->pixel_shift;
+++    const unsigned int sh = pixel_shift(s, c_idx);
++     int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
++     int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
++ 
++@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s,
++         int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
++         int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
++         int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
++-        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+++        const unsigned int sh = pixel_shift(s, c_idx);
+++        int len          = (min_pu_size >> hshift) << sh;
++         for (y = y_min; y < y_max; y++) {
++             for (x = x_min; x < x_max; x++) {
++                 if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
++                     int n;
++-                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
++-                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+++                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
+++                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
++                     for (n = 0; n < (min_pu_size >> vshift); n++) {
++                         memcpy(src, dst, len);
++                         src += stride_src;
++@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s,
++ 
++ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++ {
++-    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+++    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++     HEVCLocalContext *lc = s->HEVClc;
++     int c_idx;
++     int edges[4];  // 0 left 1 top 2 right 3 bottom
++@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     uint8_t right_tile_edge  = 0;
++     uint8_t up_tile_edge     = 0;
++     uint8_t bottom_tile_edge = 0;
+++#ifdef RPI
+++    const int sliced = rpi_sliced_frame(s->frame);
+++    const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1);
+++#else
+++    const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1);
+++#endif
++ 
++     edges[0]   = x_ctb == 0;
++     edges[1]   = y_ctb == 0;
+      edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+      edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
+  
+@@ -6287,7 +7847,301 @@ index 1f33b0c..55a0315 100644
+      if (restore) {
+          if (!edges[0]) {
+              left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+-@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         }
++     }
++ 
++-    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+++    for (c_idx = 0; c_idx < plane_count; c_idx++) {
++         int x0       = x >> s->ps.sps->hshift[c_idx];
++         int y0       = y >> s->ps.sps->vshift[c_idx];
++         int stride_src = s->frame->linesize[c_idx];
++@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0);
++         int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
++         int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
++-        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
++-        int stride_dst;
+++        ptrdiff_t stride_dst;
++         uint8_t *dst;
++ 
+++#ifdef RPI
+++        const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift;
+++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+++        uint8_t * const src = !sliced ?
+++                &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0, y0);
+++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
+++            !sliced ? src - (1 << sh) :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0);
+++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
+++            !sliced ? src + (width << sh) :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0 + width, y0);
+++
+++
+++        if (sliced && c_idx > 1) {
+++            break;
+++        }
+++#else
+++        const unsigned int sh = s->ps.sps->pixel_shift;
+++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+++        uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh);
+++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh);
+++#endif
+++
++         switch (sao->type_idx[c_idx]) {
++         case SAO_BAND:
++             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                            x_ctb, y_ctb);
++             if (s->ps.pps->transquant_bypass_enable_flag ||
++                 (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
++-            dst = lc->edge_emu_buffer;
++-            stride_dst = 2*MAX_PB_SIZE;
++-            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
++-            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
++-                                            width, height);
++-            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++-                               x, y, width, height, c_idx);
+++                dst = lc->edge_emu_buffer;
+++                stride_dst = 2*MAX_PB_SIZE;
+++                copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
+++#ifdef RPI
+++                if (sliced && c_idx != 0)
+++                {
+++                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
+++                                                    sao->offset_val[1], sao->band_position[1],
+++                                                    sao->offset_val[2], sao->band_position[2],
+++                                                    width, height);
+++                }
+++                else
+++#endif
+++                {
+++                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
+++                                                    width, height);
+++                }
+++                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+++                                   x, y, width, height, c_idx);
++             } else {
++-            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
++-                                            width, height);
+++#ifdef RPI
+++                if (sliced && c_idx != 0)
+++                {
+++                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
+++                                                    sao->offset_val[1], sao->band_position[1],
+++                                                    sao->offset_val[2], sao->band_position[2],
+++                                                    width, height);
+++                }
+++                else
+++#endif
+++                {
+++                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
+++                                                    width, height);
+++                }
++             }
++             sao->type_idx[c_idx] = SAO_APPLIED;
++             break;
++@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         {
++             int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
++             int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
++-            int left_edge = edges[0];
++             int top_edge = edges[1];
++-            int right_edge = edges[2];
++             int bottom_edge = edges[3];
++-            int sh = s->ps.sps->pixel_shift;
++-            int left_pixels, right_pixels;
++ 
++             stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
++             dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
++ 
++             if (!top_edge) {
++-                int left = 1 - left_edge;
++-                int right = 1 - right_edge;
++-                const uint8_t *src1[2];
++                 uint8_t *dst1;
++-                int src_idx, pos;
+++                int src_idx;
+++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++ 
++-                dst1 = dst - stride_dst - (left << sh);
++-                src1[0] = src - stride_src - (left << sh);
++-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
++-                pos = 0;
++-                if (left) {
+++                dst1 = dst - stride_dst;
+++
+++                if (src_l != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1, src1[src_idx], sh);
++-                    pos += (1 << sh);
+++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
++                 }
+++
++                 src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
++                            SAO_APPLIED);
++-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
++-                if (right) {
++-                    pos += width << sh;
+++                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
+++
+++                if (src_r != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
++                 }
++             }
++             if (!bottom_edge) {
++-                int left = 1 - left_edge;
++-                int right = 1 - right_edge;
++-                const uint8_t *src1[2];
++-                uint8_t *dst1;
++-                int src_idx, pos;
+++                uint8_t * const dst1 = dst + height * stride_dst;
+++                int src_idx;
+++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+++                const unsigned int hoff = height * stride_src;
++ 
++-                dst1 = dst + height * stride_dst - (left << sh);
++-                src1[0] = src + height * stride_src - (left << sh);
++-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
++-                pos = 0;
++-                if (left) {
+++                if (src_l != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1, src1[src_idx], sh);
++-                    pos += (1 << sh);
+++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
++                 }
+++
++                 src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
++                            SAO_APPLIED);
++-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
++-                if (right) {
++-                    pos += width << sh;
+++                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+++
+++                if (src_r != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
++                 }
++             }
++-            left_pixels = 0;
++-            if (!left_edge) {
+++            if (src_l != NULL) {
++                 if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                     copy_vert(dst - (1 << sh),
++                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
++                               sh, height, stride_dst, 1 << sh);
++                 } else {
++-                    left_pixels = 1;
+++                    copy_vert(dst - (1 << sh),
+++                              src_l,
+++                              sh, height, stride_dst, stride_src);
++                 }
++             }
++-            right_pixels = 0;
++-            if (!right_edge) {
+++            if (src_r != NULL) {
++                 if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                     copy_vert(dst + (width << sh),
++                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
++                               sh, height, stride_dst, 1 << sh);
++                 } else {
++-                    right_pixels = 1;
+++                    copy_vert(dst + (width << sh),
+++                              src_r,
+++                              sh, height, stride_dst, stride_src);
++                 }
++             }
++ 
++-            copy_CTB(dst - (left_pixels << sh),
++-                     src - (left_pixels << sh),
++-                     (width + left_pixels + right_pixels) << sh,
+++            copy_CTB(dst,
+++                     src,
+++                     width << sh,
++                      height, stride_dst, stride_src);
++ 
++             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                            x_ctb, y_ctb);
++-            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++-                                            sao->eo_class[c_idx], width, height);
++-            s->hevcdsp.sao_edge_restore[restore](src, dst,
++-                                                stride_src, stride_dst,
++-                                                sao,
++-                                                edges, width,
++-                                                height, c_idx,
++-                                                vert_edge,
++-                                                horiz_edge,
++-                                                diag_edge);
+++#ifdef RPI
+++            if (sliced && c_idx != 0)
+++            {
+++                // Class always the same for both U & V (which is just as well :-))
+++                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
+++                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
+++                                                width, height);
+++                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
+++                                                    stride_src, stride_dst,
+++                                                    sao,
+++                                                    edges, width,
+++                                                    height, c_idx,
+++                                                    vert_edge,
+++                                                    horiz_edge,
+++                                                    diag_edge);
+++            }
+++            else
+++#endif
+++            {
+++                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+++                                                sao->eo_class[c_idx], width, height);
+++                s->hevcdsp.sao_edge_restore[restore](src, dst,
+++                                                    stride_src, stride_dst,
+++                                                    sao,
+++                                                    edges, width,
+++                                                    height, c_idx,
+++                                                    vert_edge,
+++                                                    horiz_edge,
+++                                                    diag_edge);
+++            }
++             restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++                                x, y, width, height, c_idx);
++             sao->type_idx[c_idx] = SAO_APPLIED;
++@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     }
++ }
++ 
+++// Returns 2 or 0.
++ static int get_pcm(HEVCContext *s, int x, int y)
++ {
++     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
++@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     uint8_t *src;
++     int x, y;
++     int chroma, beta;
++-    int32_t c_tc[2], tc[2];
+++    int32_t c_tc[4], tc[2];
++     uint8_t no_p[2] = { 0 };
++     uint8_t no_q[2] = { 0 };
++ 
++@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                  s->ps.sps->pcm.loop_filter_disable_flag) ||
+                 s->ps.pps->transquant_bypass_enable_flag;
+  
+@@ -6303,27 +8157,81 @@ index 1f33b0c..55a0315 100644
+      if (x0) {
+          left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+          left_beta_offset = s->deblock[ctb - 1].beta_offset;
+-@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
++@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
++                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
++-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                 if (pcmf) {
++                     no_p[0] = get_pcm(s, x - 1, y);
++                     no_p[1] = get_pcm(s, x - 1, y + 4);
++                     no_q[0] = get_pcm(s, x, y);
++                     no_q[1] = get_pcm(s, x, y + 4);
++-                    s->hevcdsp.hevc_v_loop_filter_luma_c(src,
++-                                                         s->frame->linesize[LUMA],
++-                                                         beta, tc, no_p, no_q);
++-                } else
++-                    s->hevcdsp.hevc_v_loop_filter_luma(src,
++-                                                       s->frame->linesize[LUMA],
++-                                                       beta, tc, no_p, no_q);
+++                }
+++#ifdef RPI
+++                if (rpi_sliced_frame(s->frame)) {
+++
+++                    // This copes properly with no_p/no_q
+++                    s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y),
+++                                                     s->frame->linesize[LUMA],
+++                                                     beta, tc, no_p, no_q,
+++                                                     rpi_sliced_frame_pos_y(s->frame, x - 4, y));
+++                }
+++                else
+++#endif
+++                {
+++                    src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++                    if (pcmf) {
+++                        // Standard DSP code is broken if no_p / no_q is set
+++                        s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+++                                                           s->frame->linesize[LUMA],
+++                                                           beta, tc, no_p, no_q);
+++                    }
+++                    else
+ +#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((y>>3) & 1) << 1;
+-+                    int b = (x>>3) & 1;
+-+                    setup = s->dvq->y_setup_arm[num16];
+-+                    setup[0][b][0][a] = beta;
+-+                    setup[0][b][0][a + 1] = beta;
+-+                    setup[0][b][1][a] = tc[0];
+-+                    setup[0][b][1][a + 1] = tc[1];
+-+                } else
+++                    if (s->enable_rpi_deblock) {
+++                        uint8_t (*setup)[2][2][4];
+++                        int num16 = (y>>4)*s->setup_width + (x>>4);
+++                        int a = ((y>>3) & 1) << 1;
+++                        int b = (x>>3) & 1;
+++                        setup = s->dvq->y_setup_arm[num16];
+++                        setup[0][b][0][a] = beta;
+++                        setup[0][b][0][a + 1] = beta;
+++                        setup[0][b][1][a] = tc[0];
+++                        setup[0][b][1][a + 1] = tc[1];
+++                    } else
+ +#endif
+-                     s->hevcdsp.hevc_v_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+++                    {
+++                        s->hevcdsp.hevc_v_loop_filter_luma(src,
+++                                                           s->frame->linesize[LUMA],
+++                                                           beta, tc, no_p, no_q);
+++                    }
+++                }
++             }
++         }
++ 
++@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                 beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
++                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
++                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
++-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++                src =
+++#ifdef RPI
+++                    rpi_sliced_frame(s->frame) ?
+++                        rpi_sliced_frame_pos_y(s->frame, x, y) :
+++#endif
+++                        &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                 if (pcmf) {
++                     no_p[0] = get_pcm(s, x, y - 1);
++                     no_p[1] = get_pcm(s, x + 4, y - 1);
++@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                           s->frame->linesize[LUMA],
+                                                           beta, tc, no_p, no_q);
+                  } else
+@@ -6343,7 +8251,113 @@ index 1f33b0c..55a0315 100644
+                      s->hevcdsp.hevc_h_loop_filter_luma(src,
+                                                         s->frame->linesize[LUMA],
+                                                         beta, tc, no_p, no_q);
+-@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     }
++ 
++     if (s->ps.sps->chroma_format_idc) {
+++#ifdef RPI
+++        if (rpi_sliced_frame(s->frame)) {
+++            const int v = 2;
+++            const int h = 2;
+++
+++            // vertical filtering chroma
+++            for (y = y0; y < y_end; y += 8 * v) {
+++                for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
+++                    const int bs0 = s->vertical_bs[(x +  y          * s->bs_width) >> 2];
+++                    const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
+++
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = (get_qPy(s, x - 1, y)         + get_qPy(s, x, y)         + 1) >> 1;
+++                        const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
+++                        unsigned int no_f = 0;
+++
+++                        // tc_offset here should be set to cur_tc_offset I think
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x - 1, y) ? 1 : 0) |
+++                                (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
+++                                (get_pcm(s, x, y) ? 4 : 0) |
+++                                (get_pcm(s, x, y + 4 * v) ? 8 : 0);
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                       s->frame->linesize[1],
+++                                                       tc4,
+++                                                       rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+++                                                       no_f);
+++                    }
+++                }
+++
+++                if (y == 0)
+++                    continue;
+++
+++                // horizontal filtering chroma
+++                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+++                x_end2 = x_end;
+++                if (x_end != s->ps.sps->width)
+++                    x_end2 = x_end - 8 * h;
+++
+++                for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
+++                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
+++                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,         y - 1) + get_qPy(s, x,         y) + 1) >> 1 : 0;
+++                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++                        unsigned int no_f = 0;
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x,         y - 1) ? 1 : 0) |
+++                                (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
+++                                (get_pcm(s, x,         y)     ? 4 : 0) |
+++                                (get_pcm(s, x + 4 * h, y)     ? 8 : 0);
+++
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                             s->frame->linesize[1],
+++                                                             tc4, no_f);
+++                    }
+++                }
+++            }
+++        }
+++        else
+++#endif
++         for (chroma = 1; chroma <= 2; chroma++) {
++             int h = 1 << s->ps.sps->hshift[chroma];
++             int v = 1 << s->ps.sps->vshift[chroma];
++@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                         c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
++                         c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
++-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+++                        src =
+++#ifdef RPI
+++                            rpi_sliced_frame(s->frame) ?
+++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
++                         if (pcmf) {
++                             no_p[0] = get_pcm(s, x - 1, y);
++                             no_p[1] = get_pcm(s, x - 1, y + (4 * v));
++@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                     s->frame->linesize[chroma],
+                                                                     c_tc, no_p, no_q);
+                          } else
+@@ -6367,7 +8381,21 @@ index 1f33b0c..55a0315 100644
+                      }
+                  }
+  
+-@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                         c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
++                         c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
++-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+++                        src =
+++#ifdef RPI
+++                            rpi_sliced_frame(s->frame) ?
+++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++                         if (pcmf) {
++                             no_p[0] = get_pcm(s, x,           y - 1);
++                             no_p[1] = get_pcm(s, x + (4 * h), y - 1);
++@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                     s->frame->linesize[chroma],
+                                                                     c_tc, no_p, no_q);
+                          } else
+@@ -6387,7 +8415,7 @@ index 1f33b0c..55a0315 100644
+                              s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+-@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+      }
+  }
+  
+@@ -6457,7 +8485,7 @@ index 1f33b0c..55a0315 100644
+  
+  void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+                                             int log2_trafo_size)
+-@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+      int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
+      int min_pu_width     = s->ps.sps->min_pu_width;
+      int min_tu_width     = s->ps.sps->min_tb_width;
+@@ -6467,8 +8495,9 @@ index 1f33b0c..55a0315 100644
+ -    int i, j, bs;
+ +    int i, j;
+ +    RefPicList *rpl      = s->ref->refPicList;
+-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+++    const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
+++    const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2);  // Dup
+++    const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep
+ +    int y_pu             = y0 >> log2_min_pu_size;
+ +    int x_pu             = x0 >> log2_min_pu_size;
+ +    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+@@ -6482,7 +8511,7 @@ index 1f33b0c..55a0315 100644
+  
+      boundary_upper = y0 > 0 && !(y0 & 7);
+      if (boundary_upper &&
+-@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+            (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+          boundary_upper = 0;
+  
+@@ -6559,7 +8588,7 @@ index 1f33b0c..55a0315 100644
+      boundary_left = x0 > 0 && !(x0 & 7);
+      if (boundary_left &&
+          ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
+-@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+            (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+          boundary_left = 0;
+  
+@@ -6574,9 +8603,7 @@ index 1f33b0c..55a0315 100644
+ -        int xq_pu =  x0      >> log2_min_pu_size;
+ -        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+ -        int xq_tu =  x0      >> log2_min_tu_size;
+-+                               rpl;
+-+        MvField *left = curr - 1;
+- 
++-
+ -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+ -                int y_pu      = (y0 + i) >> log2_min_pu_size;
+ -                int y_tu      = (y0 + i) >> log2_min_tu_size;
+@@ -6594,18 +8621,20 @@ index 1f33b0c..55a0315 100644
+ -                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+ -            }
+ -    }
+-+        if (is_intra) {
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+-+                bs[j * s->bs_width >> 2] = 2;
+- 
++-
+ -    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
+ -        RefPicList *rpl = s->ref->refPicList;
+--
+++                               rpl;
+++        MvField *left = curr - 1;
++ 
+ -        // bs for TU internal horizontal PU boundaries
+ -        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+ -            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+ -            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+--
+++        if (is_intra) {
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+++                bs[j * s->bs_width >> 2] = 2;
++ 
+ -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+ -                int x_pu = (x0 + i) >> log2_min_pu_size;
+ -                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+@@ -6662,137 +8691,42 @@ index 1f33b0c..55a0315 100644
+          }
+      }
+  }
+-@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+  #undef CB
+  #undef CR
+  
+-+#if !defined(RPI_FAST_CACHEFLUSH)
+-+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+-+static void flush_buffer_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+
+-+static void flush_buffer_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+
+-+static void flush_buffer_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+#endif
+-+#endif
+-+
+-+
+ +#ifdef RPI_DEBLOCK_VPU
+-+#error Not fixed yet
+-+
+ +// ff_hevc_flush_buffer_lines
+ +// flushes and invalidates all pixel rows in [start,end-1]
+ +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = start;
+-+        int n = end;
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        if (flush_chroma) {
+-+          p = get_gpu_mem_ptr_u(s->frame);
+-+          iocache.s[0].handle = p.vcsm_handle;
+-+          iocache.s[0].cmd = 3; // clean+invalidate
+-+          iocache.s[0].addr = (int)p.arm + base;
+-+          iocache.s[0].size  = sz;
+-+          p = get_gpu_mem_ptr_v(s->frame);
+-+          iocache.s[1].handle = p.vcsm_handle;
+-+          iocache.s[1].cmd = 3; // clean+invalidate
+-+          iocache.s[1].addr = (int)p.arm + base;
+-+          iocache.s[1].size  = sz;
+-+        }
+-+        if (flush_luma) {
+-+          p = get_gpu_mem_ptr_y(s->frame);
+-+          sz = s->frame->linesize[0] * (n-curr_y);
+-+          base = s->frame->linesize[0] * curr_y;
+-+          iocache.s[2].handle = p.vcsm_handle;
+-+          iocache.s[2].cmd = 3; // clean+invalidate
+-+          iocache.s[2].addr = (int)p.arm + base;
+-+          iocache.s[2].size  = sz;
+-+        }
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        if (flush_chroma) {
+-+          flush_buffer_u(s->frame);
+-+          flush_buffer_v(s->frame);
+-+        }
+-+        if (flush_luma) {
+-+          flush_buffer_y(s->frame);
+-+        }
+-+#endif
+++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++    rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++      start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma);
+++    rpi_cache_flush_finish(rfe);
+ +}
+ +#endif
+ +
+-+#ifdef RPI_INTER_QPU
+-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+++#if RPI_INTER
+++
+++// Flush some lines of a reference frames
+++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
+ +{
+ +    if (s->enable_rpi && s->used_for_ref) {
+-+      // TODO make this use ff_hevc_flush_buffer_lines
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = ((int *)f->progress->data)[0];
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        p = get_gpu_mem_ptr_u(s->frame);
+-+        iocache.s[0].handle = p.vcsm_handle;
+-+        iocache.s[0].cmd = 3; // clean+invalidate
+-+        iocache.s[0].addr = (int)p.arm + base;
+-+        iocache.s[0].size  = sz;
+-+        p = get_gpu_mem_ptr_v(s->frame);
+-+        iocache.s[1].handle = p.vcsm_handle;
+-+        iocache.s[1].cmd = 3; // clean+invalidate
+-+        iocache.s[1].addr = (int)p.arm + base;
+-+        iocache.s[1].size  = sz;
+-+
+-+#ifdef RPI_LUMA_QPU
+-+        p = get_gpu_mem_ptr_y(s->frame);
+-+        sz = s->frame->linesize[0] * (n-curr_y);
+-+        base = s->frame->linesize[0] * curr_y;
+-+        iocache.s[2].handle = p.vcsm_handle;
+-+        iocache.s[2].cmd = 3; // clean+invalidate
+-+        iocache.s[2].addr = (int)p.arm + base;
+-+        iocache.s[2].size  = sz;
+-+#endif
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        flush_buffer_u(s->frame);
+-+        flush_buffer_v(s->frame);
+-+#ifdef RPI_LUMA_QPU
+-+        flush_buffer_y(s->frame);
+-+#endif
+-+
+-+#endif
+-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+++        const int d0 = ((int *)f->progress->data)[0];
+++        const unsigned int curr_y = d0 == -1 ? 0 : d0;  // At start of time progress is -1
+++
+++        if (curr_y < (unsigned int)f->f->height) {
+++            rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++            rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++              curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1);
+++            rpi_cache_flush_finish(rfe);
+++        }
+ +    }
+ +}
+ +#endif
+ +
+ +#ifdef RPI_DEBLOCK_VPU
+-+#error XXX
+ +/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+ +static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+ +{
+@@ -6821,16 +8755,19 @@ index 1f33b0c..55a0315 100644
+ +  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ +  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ +  s->dvq->vpu_cmds_arm[2][5] = 4;
+++
+ +  // Call VPU
+-+  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+++  {
+++      const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+++      vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5);  // 5 means to do all the commands
+++      vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
+++      vpu_qpu_job_finish(vqj);
+++  }
+ +
+ +  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ +  s->dvq = s->dvq_ents + s->dvq_n;
+ +
+-+  if (s->dvq->cmd_id != -1) {
+-+      vpu_wait(s->dvq->cmd_id);
+-+      s->dvq->cmd_id = -1;
+-+  }
+++  vpu_qpu_wait(&s->dvq->cmd_id);
+ +}
+ +
+ +#endif
+@@ -6859,14 +8796,14 @@ index 1f33b0c..55a0315 100644
+      if (s->ps.sps->sao_enabled) {
+          int y_end = y >= s->ps.sps->height - ctb_size;
+          if (y && x)
+-@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+              sao_filter_CTB(s, x - ctb_size, y);
+          if (y && x_end) {
+              sao_filter_CTB(s, x, y - ctb_size);
+ -            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
+ +#endif
+                  ff_thread_report_progress(&s->ref->tf, y, 0);
+ +            }
+@@ -6874,14 +8811,14 @@ index 1f33b0c..55a0315 100644
+          if (x_end && y_end) {
+              sao_filter_CTB(s, x , y);
+ -            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
+ +#endif
+                  ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ +            }
+ +        }
+-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+++    } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
+ +        //int newh = y + ctb_size - 4;
+ +        //int currh = s->ref->tf.progress->data[0];
+ +        //if (((y + ctb_size)&63)==0)
+@@ -6892,15 +8829,15 @@ index 1f33b0c..55a0315 100644
+ +            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ +          }
+ +        } else {
+-+#ifdef RPI_INTER_QPU
+-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++#if RPI_INTER
+++          rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+ +#endif
+ +          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+          }
+ -    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+ +#else
+-+#ifdef RPI_INTER_QPU
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++#if RPI_INTER
+++        rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+ +        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ +#endif
+          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+@@ -6910,10 +8847,23 @@ index 1f33b0c..55a0315 100644
+  
+  void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+ diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
+-index 83f2ec2..6882a8d 100644
++index 83f2ec2..bcf53dc 100644
+ --- a/libavcodec/hevc_ps.c
+ +++ b/libavcodec/hevc_ps.c
+-@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
++     switch (sps->bit_depth) {
++     case 8:
++         if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+++#if RPI_HEVC_SAND
+++        // *** Horrid kludge s.t. we start out with sand format
+++        if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
+++#else
++         if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+++#endif
++         if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
++         if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
++        break;
++@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
+      sps->amp_enabled_flag = get_bits1(gb);
+      sps->sao_enabled      = get_bits1(gb);
+  
+@@ -6923,7 +8873,7 @@ index 83f2ec2..6882a8d 100644
+      if (sps->pcm_enabled_flag) {
+          sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
+ diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
+-index 9d773d9..a6534a9 100644
++index 9d773d9..c4d7250 100644
+ --- a/libavcodec/hevcdsp.c
+ +++ b/libavcodec/hevcdsp.c
+ @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+@@ -7047,7 +8997,68 @@ index 9d773d9..a6534a9 100644
+  void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+  {
+  #undef FUNC
+-@@ -257,6 +371,8 @@ int i = 0;
++@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++ 
+++#ifndef RPI
+++#define SLICED_LOOP_FILTERS(depth)
+++#else
+++#define SLICED_LOOP_FILTERS(depth)\
+++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
+++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
+++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
+++#endif
+++
+++
++ #define HEVC_DSP(depth)                                                     \
++     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
++     hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
++@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
++     hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
++     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+++    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth);                 \
+++    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
+++    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
+++    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
+++    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
+++    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
+++    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
+++    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
+++    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
++     hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
++     hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
++     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
++@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
++     hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
++                                                                                \
+++    hevcdsp->sao_band_filter_c[0] =                                            \
+++    hevcdsp->sao_band_filter_c[1] =                                            \
+++    hevcdsp->sao_band_filter_c[2] =                                            \
+++    hevcdsp->sao_band_filter_c[3] =                                            \
+++    hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth);            \
+++    hevcdsp->sao_edge_filter_c[0] =                                            \
+++    hevcdsp->sao_edge_filter_c[1] =                                            \
+++    hevcdsp->sao_edge_filter_c[2] =                                            \
+++    hevcdsp->sao_edge_filter_c[3] =                                            \
+++    hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth);            \
+++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);        \
+++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth);        \
+++                                                                               \
++     QPEL_FUNCS(depth);                                                         \
++     QPEL_UNI_FUNCS(depth);                                                     \
++     QPEL_BI_FUNCS(depth);                                                      \
++@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     EPEL_UNI_FUNCS(depth);                                                     \
++     EPEL_BI_FUNCS(depth);                                                      \
++                                                                                \
+++    SLICED_LOOP_FILTERS(depth);                                                \
++     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
++     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
++     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
++@@ -257,6 +404,8 @@ int i = 0;
+          break;
+      }
+  
+@@ -7057,10 +9068,10 @@ index 9d773d9..a6534a9 100644
+          ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+      if (ARCH_ARM)
+ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
+-index 9f1f6dd..e221e54 100644
++index 9f1f6dd..639ecf1 100644
+ --- a/libavcodec/hevcdsp.h
+ +++ b/libavcodec/hevcdsp.h
+-@@ -42,6 +42,17 @@ typedef struct SAOParams {
++@@ -42,11 +42,26 @@ typedef struct SAOParams {
+      uint8_t type_idx[3];    ///< sao_type_idx
+  } SAOParams;
+  
+@@ -7078,21 +9089,742 @@ index 9f1f6dd..e221e54 100644
+  typedef struct HEVCDSPContext {
+      void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                      struct GetBitContext *gb, int pcm_bit_depth);
+-@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
+++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+++                    struct GetBitContext *gb, int pcm_bit_depth);
++ 
++-    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+++    void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++ 
++     void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
++ 
++@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext {
++ 
++     void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++    void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                               const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                               const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                               int width, int height);
++ 
++     /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++     void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+++    void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+++                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
++ 
++     void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                 struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                 uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++ 
++     void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                     int height, intptr_t mx, intptr_t my, int width);
++@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext {
+      void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+                                          int32_t *tc, uint8_t *no_p,
+                                          uint8_t *no_q);
+++#ifdef RPI
+++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+++                                 unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++                                 const uint8_t no_p[2], const uint8_t no_q[2],
+++                                 uint8_t * _pix_l);
+++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+++                                 unsigned int no_f);
+++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                                 uint8_t * src_l,
+++                                 unsigned int no_f);
+++
+++#endif
+++
+ +    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ +                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ +                                               MvField *curr, MvField *neigh, uint8_t *bs);
+  } HEVCDSPContext;
+  
+  void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
++diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
++index b840d17..32b9e47 100644
++--- a/libavcodec/hevcdsp_template.c
+++++ b/libavcodec/hevcdsp_template.c
++@@ -26,6 +26,9 @@
++ #include "bit_depth_template.c"
++ #include "hevcdsp.h"
++ 
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
++ 
++ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                           GetBitContext *gb, int pcm_bit_depth)
++@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
++     }
++ }
++ 
+++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+++                          GetBitContext *gb, int pcm_bit_depth)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+++        dst += stride;
+++    }
+++
+++    dst = (pixel *)_dst + 1;
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+++        dst += stride;
+++    }
+++}
+++
+++
++ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
++                                                      ptrdiff_t stride, int size)
++ {
++@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
++     }
++ }
++ 
+++static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res,
+++                                                ptrdiff_t stride, int size)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x] = av_clip_pixel(dst[x] + *res);
+++            res++;
+++        }
+++        dst += stride;
+++    }
+++}
+++
++ static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
++                                        ptrdiff_t stride)
++ {
++@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
++     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
++ }
++ 
+++// -- U -- (plaited)
+++
+++static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{
+++    FUNC(add_residual_uv)(_dst, res, stride, 4);
+++}
+++
+++static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{ // 8x8 residual add on the samples starting at _dst itself
+++    FUNC(add_residual_uv)(_dst, res, stride, 8);
+++}
+++
+++static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{ // 16x16 residual add on the samples starting at _dst itself
+++    FUNC(add_residual_uv)(_dst, res, stride, 16);
+++}
+++
+++static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{ // 32x32 residual add on the samples starting at _dst itself
+++    FUNC(add_residual_uv)(_dst, res, stride, 32);
+++}
+++
+++// -- V -- (plaited)
+++
+++static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{ // 4x4 residual add on the second sample of each interleaved pair
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 4); // _dst is a byte pointer: advance one pixel, not one byte
+++}
+++
+++static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{ // 8x8 residual add on the second sample of each interleaved pair
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 8); // _dst is a byte pointer: advance one pixel, not one byte
+++}
+++
+++static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{ // 16x16 residual add on the second sample of each interleaved pair
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 16); // _dst is a byte pointer: advance one pixel, not one byte
+++}
+++
+++static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{ // 32x32 residual add on the second sample of each interleaved pair
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 32); // _dst is a byte pointer: advance one pixel, not one byte
+++}
+++
++ 
++ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++ {
++@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++     int x, y;
++     pixel *dst = (pixel *)_dst;
++     pixel *src = (pixel *)_src;
++-    int16_t *sao_offset_val = sao->offset_val[c_idx];
++     int sao_eo_class    = sao->eo_class[c_idx];
++     int init_x = 0, width = _width, height = _height;
++ 
++@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ 
++     if (sao_eo_class != SAO_EO_VERT) {
++         if (borders[0]) {
++-            int offset_val = sao_offset_val[0];
++             for (y = 0; y < height; y++) {
++-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+++                dst[y * stride_dst] = src[y * stride_src];
++             }
++             init_x = 1;
++         }
++         if (borders[2]) {
++-            int offset_val = sao_offset_val[0];
++             int offset     = width - 1;
++             for (x = 0; x < height; x++) {
++-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++             }
++             width--;
++         }
++     }
++     if (sao_eo_class != SAO_EO_HORIZ) {
++         if (borders[1]) {
++-            int offset_val = sao_offset_val[0];
++             for (x = init_x; x < width; x++)
++-                dst[x] = av_clip_pixel(src[x] + offset_val);
+++                dst[x] = src[x];
++         }
++         if (borders[3]) {
++-            int offset_val   = sao_offset_val[0];
++-            int y_stride_dst = stride_dst * (height - 1);
++-            int y_stride_src = stride_src * (height - 1);
+++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++             for (x = init_x; x < width; x++)
++-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+++                dst[x + y_stride_dst] = src[x + y_stride_src];
++             height--;
++         }
++     }
++@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++     int x, y;
++     pixel *dst = (pixel *)_dst;
++     pixel *src = (pixel *)_src;
++-    int16_t *sao_offset_val = sao->offset_val[c_idx];
++     int sao_eo_class    = sao->eo_class[c_idx];
++     int init_x = 0, init_y = 0, width = _width, height = _height;
++ 
++@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ 
++     if (sao_eo_class != SAO_EO_VERT) {
++         if (borders[0]) {
++-            int offset_val = sao_offset_val[0];
++             for (y = 0; y < height; y++) {
++-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+++                dst[y * stride_dst] = src[y * stride_src];
++             }
++             init_x = 1;
++         }
++         if (borders[2]) {
++-            int offset_val = sao_offset_val[0];
++             int offset     = width - 1;
++             for (x = 0; x < height; x++) {
++-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++             }
++             width--;
++         }
++     }
++     if (sao_eo_class != SAO_EO_HORIZ) {
++         if (borders[1]) {
++-            int offset_val = sao_offset_val[0];
++             for (x = init_x; x < width; x++)
++-                dst[x] = av_clip_pixel(src[x] + offset_val);
+++                dst[x] = src[x];
++             init_y = 1;
++         }
++         if (borders[3]) {
++-            int offset_val   = sao_offset_val[0];
++-            int y_stride_dst = stride_dst * (height - 1);
++-            int y_stride_src = stride_src * (height - 1);
+++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++             for (x = init_x; x < width; x++)
++-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+++                dst[x + y_stride_dst] = src[x + y_stride_src];
++             height--;
++         }
++     }
++@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++     }
++ }
++ 
+++
+++// --- Plaited chroma versions
+++
+++#if BIT_DEPTH != 8
+++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{ // Stub for every bit depth other than 8: logs the function name and aborts
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);
+++    abort();
+++}
+++#else
+++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{ // 8-bit SAO band filter over interleaved chroma: even samples use the _u parameters, odd samples the _v ones
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int offset_table_u[32] = { 0 }; // one offset per band; bands not filled below keep offset 0
+++    int offset_table_v[32] = { 0 };
+++    int k, y, x;
+++    int shift  = BIT_DEPTH - 5; // leaves the top 5 bits of a sample as the band index (0..31)
+++
+++    stride_dst /= sizeof(pixel);
+++    stride_src /= sizeof(pixel);
+++    width *= 2; // two samples are processed per unit of the incoming width
+++
+++    for (k = 0; k < 4; k++)
+++    { // four consecutive bands from the left class (wrapping at 32) take offsets [1]..[4]
+++        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+++        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+++    }
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x += 2)
+++        {
+++            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
+++            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
+++        }
+++        dst += stride_dst;
+++        src += stride_src;
+++    }
+++}
+++#endif
+++
+++#if BIT_DEPTH != 8
+++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+++                                  int eo, int width, int height) {
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); // stub for every bit depth other than 8: log and abort
+++    abort();
+++}
+++#else
+++// 8-bit SAO edge filter over interleaved chroma: even samples use the _u offsets, odd samples the _v ones
+++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+++                                  int eo, int width, int height) {
+++
+++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; // indexed by 2 + the two neighbour comparisons
+++    static const int8_t pos[4][2][2] = {
+++        { { -1,  0 }, {  1, 0 } }, // horizontal
+++        { {  0, -1 }, {  0, 1 } }, // vertical
+++        { { -1, -1 }, {  1, 1 } }, // 45 degree
+++        { {  1, -1 }, { -1, 1 } }, // 135 degree
+++    };
+++    pixel *dst = (pixel *)_dst;
+++    pixel *src = (pixel *)_src;
+++    int a_stride, b_stride;
+++    int x, y;
+++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); // source stride is a fixed constant, not a parameter
+++    stride_dst /= sizeof(pixel);
+++    width *= 2; // two samples are processed per unit of the incoming width
+++
+++    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; // horizontal step doubled: same-component neighbour is 2 samples away
+++    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x += 2) {
+++            int diff0u = CMP(src[x], src[x + a_stride]);
+++            int diff1u = CMP(src[x], src[x + b_stride]);
+++            int offset_valu        = edge_idx[2 + diff0u + diff1u];
+++            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+++            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+++            int offset_valv        = edge_idx[2 + diff0v + diff1v];
+++            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
+++            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
+++        }
+++        src += stride_src;
+++        dst += stride_dst;
+++    }
+++}
+++#endif
+++
+++#if BIT_DEPTH != 8
+++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src,
+++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
+++                                    int *borders, int _width, int _height,
+++                                    int c_idx, uint8_t *vert_edge,
+++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
+++{ // Stub for every bit depth other than 8: logs the function name and aborts
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);
+++    abort();
+++}
+++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src,
+++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
+++                                    int *borders, int _width, int _height,
+++                                    int c_idx, uint8_t *vert_edge,
+++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
+++{ // Stub for every bit depth other than 8: logs the function name and aborts
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);
+++    abort();
+++}
+++#else
+++// Any old 2 byte 'normal' restore will work for these
+++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10
+++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10
+++#endif
+++
+++
++ #undef CMP
++ 
++ ////////////////////////////////////////////////////////////////////////////////
++@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ #undef TQ1
++ #undef TQ2
++ #undef TQ3
+++
+++#ifdef RPI
+++
+++// line zero
+++#define P3 pix_l[0 * xstride]
+++#define P2 pix_l[1 * xstride]
+++#define P1 pix_l[2 * xstride]
+++#define P0 pix_l[3 * xstride]
+++#define Q0 pix_r[0 * xstride]
+++#define Q1 pix_r[1 * xstride]
+++#define Q2 pix_r[2 * xstride]
+++#define Q3 pix_r[3 * xstride]
+++
+++// line three. used only for deblocking decision
+++#define TP3 pix_l[0 * xstride + 3 * ystride]
+++#define TP2 pix_l[1 * xstride + 3 * ystride]
+++#define TP1 pix_l[2 * xstride + 3 * ystride]
+++#define TP0 pix_l[3 * xstride + 3 * ystride]
+++#define TQ0 pix_r[0 * xstride + 3 * ystride]
+++#define TQ1 pix_r[1 * xstride + 3 * ystride]
+++#define TQ2 pix_r[2 * xstride + 3 * ystride]
+++#define TQ3 pix_r[3 * xstride + 3 * ystride]
+++
+++// This is identical to hevc_loop_filter_luma except that the P/Q
+++// components are on separate pointers
+++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+++                                 unsigned int _stride, unsigned int beta, const int32_t _tc[2],
+++                                 const uint8_t _no_p[2], const uint8_t _no_q[2],
+++                                 uint8_t * _pix_l)
+++{ // Luma deblock of 8 rows where the P samples are read via _pix_l and the Q samples via _pix_r
+++    int d, j;
+++    pixel *pix_l        = (pixel *)_pix_l; // P3..P0 are pix_l[0..3]
+++    pixel *pix_r        = (pixel *)_pix_r; // Q0..Q3 are pix_r[0..3]
+++    const ptrdiff_t xstride = 1;
+++    const ptrdiff_t ystride = _stride / sizeof(pixel);
+++
+++    beta <<= BIT_DEPTH - 8; // scale the threshold up for bit depths above 8
+++
+++    for (j = 0; j < 2; j++) { // two groups of 4 rows, each with its own tc / no_p / no_q entry
+++        const int dp0  = abs(P2  - 2 * P1  + P0);
+++        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
+++        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
+++        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
+++        const int d0   = dp0 + dq0;
+++        const int d3   = dp3 + dq3;
+++        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
+++        const int no_p = _no_p[j];
+++        const int no_q = _no_q[j];
+++
+++        if (d0 + d3 >= beta) { // activity on rows 0 and 3 reaches beta: skip this group unfiltered
+++            pix_l += 4 * ystride;
+++            pix_r += 4 * ystride;
+++            continue;
+++        } else {
+++            const int beta_3 = beta >> 3;
+++            const int beta_2 = beta >> 2;
+++            const int tc25   = ((tc * 5 + 1) >> 1);
+++
+++            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
+++                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+++                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
+++                // strong filtering
+++                const int tc2 = tc << 1;
+++                for (d = 0; d < 4; d++) {
+++                    const int p3 = P3; // snapshot all eight samples before any of them is overwritten
+++                    const int p2 = P2;
+++                    const int p1 = P1;
+++                    const int p0 = P0;
+++                    const int q0 = Q0;
+++                    const int q1 = Q1;
+++                    const int q2 = Q2;
+++                    const int q3 = Q3;
+++                    if (!no_p) { // P side is only written when its no_p flag is clear
+++                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+++                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+++                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+++                    }
+++                    if (!no_q) { // Q side is only written when its no_q flag is clear
+++                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+++                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+++                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+++                    }
+++                    pix_l += ystride;
+++                    pix_r += ystride;
+++                }
+++            } else { // normal filtering
+++                int nd_p = 1; // nd_p / nd_q become 2 when the second sample on that side is filtered too
+++                int nd_q = 1;
+++                const int tc_2 = tc >> 1;
+++                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+++                    nd_p = 2;
+++                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+++                    nd_q = 2;
+++
+++                for (d = 0; d < 4; d++) {
+++                    const int p2 = P2;
+++                    const int p1 = P1;
+++                    const int p0 = P0;
+++                    const int q0 = Q0;
+++                    const int q1 = Q1;
+++                    const int q2 = Q2;
+++                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+++                    if (abs(delta0) < 10 * tc) { // rows whose step is 10*tc or more are left untouched
+++                        delta0 = av_clip(delta0, -tc, tc);
+++                        if (!no_p)
+++                            P0 = av_clip_pixel(p0 + delta0);
+++                        if (!no_q)
+++                            Q0 = av_clip_pixel(q0 - delta0);
+++                        if (!no_p && nd_p > 1) {
+++                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+++                            P1 = av_clip_pixel(p1 + deltap1);
+++                        }
+++                        if (!no_q && nd_q > 1) {
+++                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+++                            Q1 = av_clip_pixel(q1 + deltaq1);
+++                        }
+++                    }
+++                    pix_l += ystride;
+++                    pix_r += ystride;
+++                }
+++            }
+++        }
+++    }
+++}
+++
+++#undef TP3
+++#undef TP2
+++#undef TP1
+++#undef TP0
+++#undef TQ0
+++#undef TQ1
+++#undef TQ2
+++#undef TQ3
+++
+++#undef P3
+++#undef P2
+++#undef P1
+++#undef P0
+++#undef Q0
+++#undef Q1
+++#undef Q2
+++#undef Q3
+++
+++#define P1 pix_l[0 * xstride]
+++#define P0 pix_l[1 * xstride]
+++#define Q0 pix_r[0 * xstride]
+++#define Q1 pix_r[1 * xstride]
+++
+++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
+++                                          ptrdiff_t _ystride, const int32_t *_tc,
+++                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
+++{ // Filters 8 rows of one chroma component; P samples come from _pix_l, Q samples from _pix_r
+++    int d, j, no_p, no_q;
+++    pixel *pix_l        = (pixel *)_pix_l; // P1, P0 are pix_l[0], pix_l[xstride]
+++    pixel *pix_r        = (pixel *)_pix_r; // Q0, Q1 are pix_r[0], pix_r[xstride]
+++    ptrdiff_t xstride = _xstride / sizeof(pixel);
+++    ptrdiff_t ystride = _ystride / sizeof(pixel);
+++
+++    for (j = 0; j < 2; j++) { // two groups of 4 rows, each with its own tc / no_p / no_q entry
+++        const int tc = _tc[j] << (BIT_DEPTH - 8);
+++        if (tc <= 0) { // non-positive tc: skip this group unfiltered
+++            pix_l += 4 * ystride;
+++            pix_r += 4 * ystride;
+++            continue;
+++        }
+++        no_p = _no_p[j];
+++        no_q = _no_q[j];
+++
+++        for (d = 0; d < 4; d++) {
+++            int delta0;
+++            const int p1 = P1;
+++            const int p0 = P0;
+++            const int q0 = Q0;
+++            const int q1 = Q1;
+++            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+++            if (!no_p) // P0 is only written when the no_p flag is clear
+++                P0 = av_clip_pixel(p0 + delta0);
+++            if (!no_q) // Q0 is only written when the no_q flag is clear
+++                Q0 = av_clip_pixel(q0 - delta0);
+++            pix_l += ystride;
+++            pix_r += ystride;
+++        }
+++    }
+++}
+++
+++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+++                                 unsigned int no_f)
+++{ // Runs the chroma loop filter twice over interleaved data: from pix, then from one pixel further on
+++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // non-zero when bit 0 / bit 1 of no_f is set
+++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // non-zero when bit 2 / bit 3 of no_f is set
+++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // the four bytes of tc4, lowest first
+++    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first pass uses tc[0..1]
+++    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // second pass uses tc[2..3]
+++}
+++
+++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                                 uint8_t * src_l,
+++                                 unsigned int no_f)
+++{ // Runs hevc_loop_filter_uv2 twice over interleaved data: from src_l/src_r, then from one pixel further on
+++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // non-zero when bit 0 / bit 1 of no_f is set
+++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // non-zero when bit 2 / bit 3 of no_f is set
+++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // the four bytes of tc4, lowest first
+++    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first pass uses tc[0..1]
+++    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // second pass uses tc[2..3]
+++}
+++
+++#undef P1
+++#undef P0
+++#undef Q0
+++#undef Q1
+++
+++
+++#endif
+++
++diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
++index 02c1766..cea16ea 100644
++--- a/libavcodec/hevcpred.c
+++++ b/libavcodec/hevcpred.c
++@@ -24,6 +24,7 @@
++ 
++ #include "hevcpred.h"
++ 
+++#define PRED_C 0
++ #define BIT_DEPTH 8
++ #include "hevcpred_template.c"
++ #undef BIT_DEPTH
++@@ -39,13 +40,37 @@
++ #define BIT_DEPTH 12
++ #include "hevcpred_template.c"
++ #undef BIT_DEPTH
+++#undef PRED_C
+++
+++#ifdef RPI
+++#define PRED_C 1
+++#define BIT_DEPTH 8
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 9
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 10
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 12
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++#undef PRED_C
+++#endif
++ 
++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
++ {
++ #undef FUNC
++ #define FUNC(a, depth) a ## _ ## depth
++ 
++-#define HEVC_PRED(depth)                                \
+++#undef FUNCC
+++#define FUNCC(a, depth) a ## _ ## depth ## _c
+++
+++#define HEVC_PRED_Y(depth)                                \
++     hpc->intra_pred[0]   = FUNC(intra_pred_2, depth);   \
++     hpc->intra_pred[1]   = FUNC(intra_pred_3, depth);   \
++     hpc->intra_pred[2]   = FUNC(intra_pred_4, depth);   \
++@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
++     hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++     hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
++ 
+++#define HEVC_PRED_C(depth)                                \
+++    hpc->intra_pred_c[0]   = FUNCC(intra_pred_2, depth);   \
+++    hpc->intra_pred_c[1]   = FUNCC(intra_pred_3, depth);   \
+++    hpc->intra_pred_c[2]   = FUNCC(intra_pred_4, depth);   \
+++    hpc->intra_pred_c[3]   = FUNCC(intra_pred_5, depth);   \
+++    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
+++    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
+++    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
+++    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
+++    hpc->pred_dc_c         = FUNCC(pred_dc, depth);        \
+++    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+++    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+++    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
+++    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+++
+++#ifdef RPI
+++#define HEVC_PRED(depth) \
+++    HEVC_PRED_Y(depth); \
+++    HEVC_PRED_C(depth);
+++#else
+++#define HEVC_PRED(depth) \
+++    HEVC_PRED_Y(depth);
+++#endif
+++
++     switch (bit_depth) {
++     case 9:
++         HEVC_PRED(9);
++diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
++index eb17663..00ba3f9 100644
++--- a/libavcodec/hevcpred.h
+++++ b/libavcodec/hevcpred.h
++@@ -38,6 +38,17 @@ typedef struct HEVCPredContext {
++     void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++                             const uint8_t *left, ptrdiff_t stride,
++                             int c_idx, int mode);
+++#ifdef RPI
+++    void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+++
+++    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+++                           const uint8_t *left, ptrdiff_t stride);
+++    void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+++                    ptrdiff_t stride, int log2_size, int c_idx);
+++    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+++                            const uint8_t *left, ptrdiff_t stride,
+++                            int c_idx, int mode);
+++#endif
++ } HEVCPredContext;
++ 
++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+ diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 6ae87cc..28d2653 100644
++index 6ae87cc..c14dddd 100644
+ --- a/libavcodec/hevcpred_template.c
+ +++ b/libavcodec/hevcpred_template.c
+-@@ -20,6 +20,8 @@
++@@ -20,13 +20,55 @@
+   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+   */
+  
+@@ -7101,7 +9833,54 @@ index 6ae87cc..28d2653 100644
+  #include "libavutil/pixdesc.h"
+  
+  #include "bit_depth_template.c"
+-@@ -69,8 +71,11 @@ do {                                  \
++ #include "hevcpred.h"
++ 
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
+++
+++#define DUMP_PRED 0
+++
++ #define POS(x, y) src[(x) + stride * (y)]
++ 
+++#if PRED_C
+++
+++typedef uint8_t (* c8_dst_ptr_t)[2];
+++typedef const uint8_t (* c8_src_ptr_t)[2];
+++
+++#if BIT_DEPTH == 8
+++#undef BIT_DEPTH
+++#define BIT_DEPTH 16
+++#include "bit_depth_template.c"
+++#undef FUNC
+++#define FUNC(a) FUNC3(a, 8, _c)
+++#else
+++#undef FUNC
+++#define FUNC FUNCC
+++#endif
+++
+++#endif
+++
+++#if DUMP_PRED
+++#ifndef DEBUG_ONCE
+++#define DEBUG_ONCE
+++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+++{ // Debug helper: prints a size x size grid made of every second byte of each row
+++    for (unsigned int y = 0; y != size; y++, data += stride * 2) { // each row advances data by stride * 2 bytes
+++        for (unsigned int x = 0; x != size; x++) {
+++            printf("%4d", data[x * 2]); // x * 2 skips the byte in between
+++        }
+++        printf("\n");
+++    }
+++    printf("\n"); // blank line after the block
+++}
+++#endif
+++#endif
+++
++ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
++                                               int log2_size, int c_idx)
++ {
++@@ -69,8 +111,11 @@ do {                                  \
+                  AV_WN4P(&ptr[i], a);                                           \
+              else                                                               \
+                  a = PIXEL_SPLAT_X4(ptr[i + 3])
+@@ -7114,17 +9893,399 @@ index 6ae87cc..28d2653 100644
+      int i;
+      int hshift = s->ps.sps->hshift[c_idx];
+      int vshift = s->ps.sps->vshift[c_idx];
+-@@ -114,6 +119,10 @@ do {                                  \
++@@ -79,15 +124,23 @@ do {                                  \
++     int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
++     int size_in_luma_v = size << vshift;
++     int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
++-    int x = x0 >> hshift;
++-    int y = y0 >> vshift;
+++    const int x = x0 >> hshift;
+++    const int y = y0 >> vshift;
++     int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++     int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++ 
++     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
++ 
++-    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+++    const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+++#if defined(RPI)
+++    pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ?
+++            (pixel*)s->frame->data[c_idx] + x + y * stride :
+++        c_idx == 0 ?
+++            (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) :
+++            (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y);
+++#else
++     pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
+++#endif
++ 
++     int min_pu_width = s->ps.sps->min_pu_width;
++ 
++@@ -95,14 +148,20 @@ do {                                  \
++                               lc->tu.intra_pred_mode;
++     pixel4 a;
++     pixel  left_array[2 * MAX_TB_SIZE + 1];
+++#if !PRED_C
++     pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
+++#endif
++     pixel  top_array[2 * MAX_TB_SIZE + 1];
+++#if !PRED_C
++     pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
+++#endif
++ 
++     pixel  *left          = left_array + 1;
++     pixel  *top           = top_array  + 1;
+++#if !PRED_C
++     pixel  *filtered_left = filtered_left_array + 1;
++     pixel  *filtered_top  = filtered_top_array  + 1;
+++#endif
++     int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
++     int cand_left        = lc->na.cand_left;
++     int cand_up_left     = lc->na.cand_up_left;
++@@ -114,6 +173,26 @@ do {                                  \
+      int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                             (x0 + size_in_luma_h)) >> hshift;
+  
+++    pixel * src_l = src - 1;
+++    pixel * src_u = src - stride;
+++    pixel * src_ur = src_u + size;
+++
+ +#ifdef DISABLE_INTRA
+ +    return;
+ +#endif
+ +
+++#if defined(RPI)
+++    if (s->frame->format == AV_PIX_FMT_SAND128) {
+++        const AVFrame * const frame = s->frame;
+++        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
+++        const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride;
+++        if ((x & mask) == 0)
+++            src_l -= stripe_adj;
+++        if (((x + size) & mask) == 0)
+++            src_ur += stripe_adj;
+++    }
+++#endif
+++
+      if (s->ps.pps->constrained_intra_pred_flag == 1) {
+          int size_in_luma_pu_v = PU(size_in_luma_v);
+          int size_in_luma_pu_h = PU(size_in_luma_h);
++@@ -163,23 +242,24 @@ do {                                  \
++         top[-1] = 128;
++     }
++     if (cand_up_left) {
++-        left[-1] = POS(-1, -1);
+++        left[-1] = src_l[-stride];
++         top[-1]  = left[-1];
++     }
++     if (cand_up)
++-        memcpy(top, src - stride, size * sizeof(pixel));
+++        // Always good - even with sand
+++        memcpy(top, src_u, size * sizeof(pixel));
++     if (cand_up_right) {
++-        memcpy(top + size, src - stride + size, size * sizeof(pixel));
++-        EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
+++        memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
+++        EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
++                size - top_right_size);
++     }
++     if (cand_left)
++         for (i = 0; i < size; i++)
++-            left[i] = POS(-1, i);
+++            left[i] = src_l[stride * i];
++     if (cand_bottom_left) {
++         for (i = size; i < size + bottom_left_size; i++)
++-            left[i] = POS(-1, i);
++-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+++            left[i] = src_l[stride * i];
+++        EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
++                size - bottom_left_size);
++     }
++ 
++@@ -268,7 +348,11 @@ do {                                  \
++             cand_up_left = 1;
++             cand_left    = 1;
++         } else { // No samples available
+++#if PRED_C && BIT_DEPTH == 16
+++            left[-1] = 0x8080;
+++#else
++             left[-1] = (1 << (BIT_DEPTH - 1));
+++#endif
++             EXTEND(top,  left[-1], 2 * size);
++             EXTEND(left, left[-1], 2 * size);
++         }
++@@ -287,6 +371,9 @@ do {                                  \
++     top[-1] = left[-1];
++ 
++     // Filtering process
+++    // Sand128 can only apply to chroma_format_idc == 1 so we don't need to
+++    // worry about chroma smoothing for that case
+++#if !PRED_C
++     if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
++         if (mode != INTRA_DC && size != 4){
++             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
++@@ -342,13 +429,46 @@ do {                                  \
++                                            mode);
++         break;
++     }
+++#else
+++    switch (mode) {
+++    case INTRA_PLANAR:
+++        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+++                                          (uint8_t *)left, stride);
+++        break;
+++    case INTRA_DC:
+++        s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
+++                       (uint8_t *)left, stride, log2_size, c_idx);
+++        break;
+++    default:
+++        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+++                                           (uint8_t *)left, stride, c_idx,
+++                                           mode);
+++        break;
+++    }
+++
+++#if DUMP_PRED
+++    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
+++    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
+++    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
+++    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
+++#endif
+++#endif
++ }
++ 
+++#if !PRED_C || BIT_DEPTH == 16
++ #define INTRA_PRED(size)                                                            \
++ static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
++ {                                                                                   \
++     FUNC(intra_pred)(s, x0, y0, size, c_idx);                                       \
++ }
+++#else
+++#define INTRA_PRED(size)                                                            \
+++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
+++{                                                                                   \
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
+++    abort();                                                                        \
+++}
+++#endif
++ 
++ INTRA_PRED(2)
++ INTRA_PRED(3)
++@@ -357,6 +477,7 @@ INTRA_PRED(5)
++ 
++ #undef INTRA_PRED
++ 
+++#if !PRED_C
++ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++                                   const uint8_t *_left, ptrdiff_t stride,
++                                   int trafo_size)
++@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to
++             POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
++                          (size - 1 - y) * top[x]  + (y + 1) * left[size] + size) >> (trafo_size + 1);
++ }
+++#else
+++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
+++                                  const uint8_t * _left, ptrdiff_t stride,
+++                                  int trafo_size)
+++{
+++    int x, y;
+++    int size = 1 << trafo_size;
+++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src;
+++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
+++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++
+++    for (y = 0; y < size; y++, src += stride)
+++    {
+++        for (x = 0; x < size; x++)
+++        {
+++            src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0]  +
+++                         (size - 1 - y) * top[x][0]  + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
+++            src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1]  +
+++                         (size - 1 - y) * top[x][1]  + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
+++        }
+++    }
+++}
+++#endif
++ 
+++#if !PRED_C || BIT_DEPTH == 16
++ #define PRED_PLANAR(size)\
++ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
++                                        const uint8_t *left, ptrdiff_t stride)   \
++ {                                                                               \
++     FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
++ }
+++#else
+++#define PRED_PLANAR(size)\
+++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
+++                                       const uint8_t *left, ptrdiff_t stride)   \
+++{                                                                               \
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__);                            \
+++    abort();                                                                    \
+++}
+++#endif
++ 
++ PRED_PLANAR(0)
++ PRED_PLANAR(1)
++@@ -386,6 +540,7 @@ PRED_PLANAR(3)
++ 
++ #undef PRED_PLANAR
++ 
+++#if !PRED_C
++ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                           const uint8_t *_left,
++                           ptrdiff_t stride, int log2_size, int c_idx)
++@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++             POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
++     }
++ }
+++#else
+++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+++                          const uint8_t *_left,
+++                          ptrdiff_t stride, int log2_size, int c_idx)
+++{
+++    unsigned int i, j;
+++    const unsigned int size = (1 << log2_size);
+++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src;
+++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
+++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++    unsigned int dc0 = size;
+++    unsigned int dc1 = size;
+++
+++    for (i = 0; i < size; i++)
+++    {
+++        dc0 += left[i][0] + top[i][0];
+++        dc1 += left[i][1] + top[i][1];
+++    }
+++
+++    dc0 >>= log2_size + 1;
+++    dc1 >>= log2_size + 1;
+++
+++    for (i = 0; i < size; i++, src += stride)
+++    {
+++        for (j = 0; j < size; ++j)
+++        {
+++            src[j][0] = dc0;
+++            src[j][1] = dc1;
++ 
+++        }
+++    }
+++}
+++#endif
+++
+++#ifndef ANGLE_CONSTS
+++#define ANGLE_CONSTS
+++static const int intra_pred_angle[] = {
+++     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+++    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
+++};
+++static const int inv_angle[] = {
+++    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+++    -630, -910, -1638, -4096
+++};
+++#endif
+++
+++#if !PRED_C
++ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++                                                 const uint8_t *_top,
++                                                 const uint8_t *_left,
++@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++     const pixel *top  = (const pixel *)_top;
++     const pixel *left = (const pixel *)_left;
++ 
++-    static const int intra_pred_angle[] = {
++-         32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++-        -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
++-    };
++-    static const int inv_angle[] = {
++-        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++-        -630, -910, -1638, -4096
++-    };
++-
++     int angle = intra_pred_angle[mode - 2];
++     pixel ref_array[3 * MAX_TB_SIZE + 4];
++     pixel *ref_tmp = ref_array + size;
++@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++         }
++     }
++ }
+++#else
+++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
+++                                                const uint8_t *_top,
+++                                                const uint8_t *_left,
+++                                                ptrdiff_t stride, int c_idx,
+++                                                int mode, int size)
+++{
+++    int x, y;
+++    c8_dst_ptr_t src  = (c8_dst_ptr_t)_src;
+++    c8_src_ptr_t top  = (c8_src_ptr_t)_top;
+++    c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++
+++    const int angle = intra_pred_angle[mode - 2];
+++    uint8_t ref_array[3 * MAX_TB_SIZE + 4][2];
+++    c8_dst_ptr_t ref_tmp = ref_array + size;
+++    c8_src_ptr_t ref;
+++    const int last = (size * angle) >> 5;
+++
+++    if (mode >= 18) {
+++        ref = top - 1;
+++        if (angle < 0 && last < -1) {
+++            memcpy(ref_tmp, top - 1, (size + 1) * 2);
+++            for (x = last; x <= -1; x++)
+++            {
+++                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+++                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+++            }
+++            ref = (c8_src_ptr_t)ref_tmp;
+++        }
+++
+++        for (y = 0; y < size; y++, src += stride) {
+++            const int idx  = ((y + 1) * angle) >> 5;
+++            const int fact = ((y + 1) * angle) & 31;
+++            if (fact) {
+++                for (x = 0; x < size; ++x) {
+++                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
+++                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
+++                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
+++                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
+++                }
+++            } else {
+++                memcpy(src, ref + idx + 1, size * 2);
+++            }
+++        }
+++    } else {
+++        ref = left - 1;
+++        if (angle < 0 && last < -1) {
+++            memcpy(ref_tmp, left - 1, (size + 1) * 2);
+++            for (x = last; x <= -1; x++)
+++            {
+++                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+++                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+++            }
+++            ref = (c8_src_ptr_t)ref_tmp;
+++        }
+++
+++        for (x = 0; x < size; x++, src++) {
+++            const int idx  = ((x + 1) * angle) >> 5;
+++            const int fact = ((x + 1) * angle) & 31;
+++            if (fact) {
+++                for (y = 0; y < size; y++) {
+++                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
+++                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
+++                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
+++                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
+++                }
+++            } else {
+++                for (y = 0; y < size; y++)
+++                {
+++                    src[y * stride][0] = ref[y + idx + 1][0];
+++                    src[y * stride][1] = ref[y + idx + 1][1];
+++                }
+++            }
+++        }
+++    }
+++}
+++#endif
++ 
++ static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++                                  const uint8_t *left,
+ diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+ index 099a8c5..bdff2d2 100644
+ --- a/libavcodec/mmaldec.c
+@@ -7169,6 +10330,87 @@ index 3adf28d..2f9195f 100644
+      if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+          s->codec_id == AV_CODEC_ID_MPEG4 &&
+          avctx->idct_algo == FF_IDCT_AUTO) {
++diff --git a/libavcodec/raw.c b/libavcodec/raw.c
++index bfa2537..1bca89e 100644
++--- a/libavcodec/raw.c
+++++ b/libavcodec/raw.c
++@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
++     { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
++     { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
++ 
+++    /* RPI */
+++#ifdef RPI
+++    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
+++#endif
+++
++     /* special */
++     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
++     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
++diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
++index d837056..81256b5 100644
++--- a/libavcodec/rawenc.c
+++++ b/libavcodec/rawenc.c
++@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS
++     return 0;
++ }
++ 
+++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off)
+++{
+++    for (int y = 0; y != frame->height / 2; ++y) {
+++        for (int x = 0; x < frame->width; x += frame->linesize[0]) {
+++            const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off;
+++            const int w = FFMIN(frame->linesize[0], frame->width - x) / 2;
+++            for (int i = 0; i < w; ++i)
+++                *dst++ = p[i * 2];
+++        }
+++    }
+++    return dst;
+++}
+++
+++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
+++                      const AVFrame *frame)
+++{
+++    int size = frame->width * frame->height * 3 / 2;
+++    uint8_t * dst;
+++    int ret;
+++
+++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
+++        return ret;
+++
+++    dst = pkt->data;
+++
+++    // Luma is "easy"
+++    for (int y = 0; y != frame->height; ++y) {
+++        for (int x = 0; x < frame->width; x += frame->linesize[0]) {
+++            const int w = FFMIN(frame->linesize[0], frame->width - x);
+++            memcpy(dst,
+++                frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w);
+++            dst += w;
+++        }
+++    }
+++    // Chroma is dull
+++    dst = cpy_sand_c(dst, frame, 0);
+++    dst = cpy_sand_c(dst, frame, 1);
+++
+++    return 0;
+++}
+++
++ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
++                       const AVFrame *frame, int *got_packet)
++ {
++@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
++     if (ret < 0)
++         return ret;
++ 
+++    if (frame->format == AV_PIX_FMT_SAND128) {
+++        ret = raw_sand_as_yuv420(avctx, pkt, frame);
+++        *got_packet = (ret == 0);
+++        return ret;
+++    }
+++
++     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
++         return ret;
++     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+ diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+ new file mode 100644
+ index 0000000..4309f1c
+@@ -11170,10 +14412,10 @@ index 0000000..5543093
+ +  pop r6-r7, pc
+ diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+ new file mode 100644
+-index 0000000..3904efc
++index 0000000..0255f5d
+ --- /dev/null
+ +++ b/libavcodec/rpi_mailbox.c
+-@@ -0,0 +1,340 @@
++@@ -0,0 +1,149 @@
+ +/*
+ +Copyright (c) 2012, Broadcom Europe Ltd.
+ +All rights reserved.
+@@ -11201,6 +14443,8 @@ index 0000000..3904efc
+ +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/
+ +
+++#ifdef RPI
+++
+ +#include <stdio.h>
+ +#include <string.h>
+ +#include <stdlib.h>
+@@ -11208,7 +14452,6 @@ index 0000000..3904efc
+ +#include <unistd.h>
+ +#include <assert.h>
+ +#include <stdint.h>
+-+#include <sys/mman.h>
+ +#include <sys/ioctl.h>
+ +
+ +#include <linux/ioctl.h>
+@@ -11218,137 +14461,29 @@ index 0000000..3904efc
+ +#define DEVICE_FILE_NAME "/dev/vcio"
+ +
+ +#include "rpi_mailbox.h"
+++//#include <interface/vctypes/vc_image_structs.h>
+ +
+-+#define PAGE_SIZE (4*1024)
+-+
+-+// Shared memory will not be cached in ARM cache
+-+void *mapmem_shared(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_SHARED/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+-+
+-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
+-+void *mapmem_private(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_PRIVATE/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+-+
+-+void unmapmem(void *addr, unsigned size)
+-+{
+-+   int s = munmap(addr, size);
+-+   if (s != 0) {
+-+      printf("munmap error %d\n", s);
+-+      exit (-1);
+-+   }
+-+}
+-+
+-+/*
+-+ * use ioctl to send mbox property message
+-+ */
+-+
+-+static int mbox_property(int file_desc, void *buf)
+-+{
+-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+-+
+-+   if (ret_val < 0) {
+-+      printf("ioctl_set_msg failed:%d\n", ret_val);
+-+   }
+-+
+-+#ifdef DEBUG
+-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+-+   for (i=0; i<size/4; i++)
+-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+-+#endif
+-+   return ret_val;
+-+}
+-+
+-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x3000c; // (the tag id)
+-+   p[i++] = 12; // (size of the buffer)
+-+   p[i++] = 12; // (size of the data)
+-+   p[i++] = size; // (num bytes? or pages?)
+-+   p[i++] = align; // (alignment)
+-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+unsigned mem_free(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++/*
+++ * use ioctl to send mbox property message
+++ */
+ +
+-+   p[i++] = 0x3000f; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++static int mbox_property(int file_desc, void *buf)
+++{
+++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++   if (ret_val < 0) {
+++      printf("ioctl_set_msg failed:%d\n", ret_val);
+++   }
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++#ifdef DEBUG
+++   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+++   for (i=0; i<size/4; i++)
+++      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+++#endif
+++   return ret_val;
+ +}
+ +
+-+unsigned mem_lock(int file_desc, unsigned handle)
+++unsigned mbox_mem_lock(int file_desc, unsigned handle)
+ +{
+ +   int i=0;
+ +   unsigned p[32];
+@@ -11367,7 +14502,7 @@ index 0000000..3904efc
+ +   return p[5];
+ +}
+ +
+-+unsigned mem_unlock(int file_desc, unsigned handle)
+++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
+ +{
+ +   int i=0;
+ +   unsigned p[32];
+@@ -11386,117 +14521,30 @@ index 0000000..3904efc
+ +   return p[5];
+ +}
+ +
+-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x30010; // (the tag id)
+-+   p[i++] = 28; // (size of the buffer)
+-+   p[i++] = 28; // (size of the data)
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define GET_VCIMAGE_PARAMS 0x30044
+ +
+-+unsigned qpu_enable(int file_desc, unsigned enable)
+++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
+ +{
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x30012; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = enable;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30011; // (the tag id)
+-+   p[i++] = 16; // (size of the buffer)
+-+   p[i++] = 16; // (size of the data)
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+-+   int i=0;
+-+   unsigned p[32];
+++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
+++    uint32_t * p = buf;
+++    void * rimg;
+++    int rv;
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30018; // (the tag id)
+-+   p[i++] = 88; // (size of the buffer)
+-+   p[i++] = 88; // (size of the data)
+-+
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = num_qpus_2;
+-+   p[i++] = control_2;
+-+   p[i++] = noflush_2;
+-+   p[i++] = timeout_2; // ms
+-+
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = code_2;
+-+   p[i++] = r0_2;
+-+   p[i++] = r1_2;
+-+   p[i++] = r2_2;
+-+   p[i++] = r3_2;
+-+   p[i++] = r4_2;
+-+   p[i++] = r5_2;
+++    *p++ = 0; // size
+++    *p++ = 0; // process request
+++    *p++ = GET_VCIMAGE_PARAMS;
+++    *p++ = sizeof(*img);
+++    *p++ = sizeof(*img);
+++    rimg = p;
+++    memcpy(p, img, sizeof(*img));
+++    p += sizeof(*img) / sizeof(*p);
+++    *p++ = 0;  // End tag
+++    buf[0] = (p - buf) * sizeof(*p);
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++    rv = mbox_property(fd, buf);
+++    memcpy(img, rimg, sizeof(*img));
+ +
+-+   mbox_property(file_desc, p);
+-+   return;
+++    return rv;
+ +}
+ +
+ +int mbox_open() {
+@@ -11514,55 +14562,80 @@ index 0000000..3904efc
+ +void mbox_close(int file_desc) {
+ +  close(file_desc);
+ +}
+++
+++#endif
+++
+ diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+ new file mode 100644
+-index 0000000..5898102
++index 0000000..b316878
+ --- /dev/null
+ +++ b/libavcodec/rpi_mailbox.h
+-@@ -0,0 +1,25 @@
++@@ -0,0 +1,58 @@
+ +#ifndef RPI_MAILBOX_H
+ +#define RPI_MAILBOX_H
+ +
+++/* The image structure. */
+++typedef struct vc_image_extra_uv_s {
+++  void *u, *v;
+++  int vpitch;
+++} VC_IMAGE_EXTRA_UV_T;
+++
+++typedef union {
+++    VC_IMAGE_EXTRA_UV_T uv;
+++//  VC_IMAGE_EXTRA_RGBA_T rgba;
+++//  VC_IMAGE_EXTRA_PAL_T pal;
+++//  VC_IMAGE_EXTRA_TF_T tf;
+++//  VC_IMAGE_EXTRA_BAYER_T bayer;
+++//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
+++//  VC_IMAGE_EXTRA_CODEC_T codec;
+++//  VC_IMAGE_EXTRA_OPENGL_T opengl;
+++} VC_IMAGE_EXTRA_T;
+++
+++
+++typedef struct VC_IMAGE_T {
+++  unsigned short                  type;           /* should restrict to 16 bits */
+++  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
+++  unsigned short                  width;          /* width in pixels */
+++  unsigned short                  height;         /* height in pixels */
+++  int                             pitch;          /* pitch of image_data array in bytes */
+++  int                             size;           /* number of bytes available in image_data array */
+++  void                           *image_data;     /* pixel data */
+++  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
+++  void                           *metadata;       /* metadata header for the image */
+++  void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
+++  int                             mem_handle;     /* the mem handle for relocatable memory storage */
+++  int                             metadata_size;  /* size of metadata of each channel in bytes */
+++  int                             channel_offset; /* offset of consecutive channels in bytes */
+++  uint32_t                        video_timestamp;/* 90000 Hz RTP time domain - derived from audio timestamp */
+++  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
+++  uint8_t                         current_channel;/* the channel this header is currently pointing to */
+++  uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
+++  uint8_t                         is_channel_linked;     /* Track if the above structure has been used to link the header
+++                                                            into a linked-multichannel image */
+++  uint8_t                         channel_index;         /* index of the channel this header represents while
+++                                                            it is being linked. */
+++  uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
+++} VC_IMAGE_T;
+++
+++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
+++
+++
+ +extern int mbox_open(void);
+ +extern void mbox_close(int file_desc);
+ +
+-+extern unsigned get_version(int file_desc);
+-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+-+extern unsigned mem_free(int file_desc, unsigned handle);
+-+extern unsigned mem_lock(int file_desc, unsigned handle);
+-+extern unsigned mem_unlock(int file_desc, unsigned handle);
+-+extern void *mapmem_shared(unsigned base, unsigned size);
+-+extern void *mapmem_private(unsigned base, unsigned size);
+-+extern void unmapmem(void *addr, unsigned size);
+-+
+-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+-+extern unsigned qpu_enable(int file_desc, unsigned enable);
+++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
+++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+++
+++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+ new file mode 100644
+-index 0000000..365f4a6
++index 0000000..7c0eedd
+ --- /dev/null
+ +++ b/libavcodec/rpi_qpu.c
+-@@ -0,0 +1,993 @@
++@@ -0,0 +1,902 @@
+ +#ifdef RPI
+-+// Use vchiq service for submitting jobs
+-+#define GPUSERVICE
+-+
+-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+-+//#define RPI_TIME_TOTAL_QPU
+-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-+//#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+-+#define RPI_TIME_TOTAL_POSTED
+-+
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +#include <string.h>
+@@ -11575,27 +14648,35 @@ index 0000000..365f4a6
+ +#include <pthread.h>
+ +#include <time.h>
+ +
+++#include <interface/vcsm/user-vcsm.h>
+++
+ +#include "rpi_mailbox.h"
+ +#include "rpi_qpu.h"
+ +#include "rpi_shader.h"
+ +#include "rpi_hevc_transform.h"
+++#include "rpi_zc.h"
+ +
+-+#include "rpi_user_vcsm.h"
+-+#ifdef GPUSERVICE
+ +#pragma GCC diagnostic push
+ +// Many many redundant decls in the header files
+ +#pragma GCC diagnostic ignored "-Wredundant-decls"
+ +#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+ +#pragma GCC diagnostic pop
+-+#endif
+ +
+-+// QPU profile flags
+-+#define NO_FLUSH 1
+-+#define CLEAR_PROFILE 2
+-+#define OUTPUT_COUNTS 4
+++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
+++#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
+++
+++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
+++// Beware this is expensive and will probably throw off all other timing by >10%
+++#define RPI_TRACE_QPU_PROFILE_ALL       0
+ +
+-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++// QPU "noflush" flags
+++// a mixture of flushing & profiling
+ +
+++#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
+++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
+++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
+++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
+++#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
+ +
+ +// On Pi2 there is no way to access the VPU L2 cache
+ +// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+@@ -11652,165 +14733,155 @@ index 0000000..365f4a6
+ +{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+ +};
+ +
+++// Code/constants on GPU
+ +struct GPU
+ +{
+ +  unsigned int qpu_code[QPU_CODE_SIZE];
+ +  unsigned int vpu_code[VPU_CODE_SIZE];
+ +  short transMatrix2even[16*16*2];
+-+  int open_count; // Number of allocated video buffers
+-+  int      mb; // Mailbox handle
+-+  int      vc; // Address in GPU memory
+-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+ +};
+ +
+-+// Stop more than one thread trying to allocate memory or use the processing resources at once
+-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+static volatile struct GPU* gpu = NULL;
+-+static GPU_MEM_PTR_T gpu_mem_ptr;
+-+
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+-+static unsigned int Microseconds(void) {
+-+    struct timespec ts;
+-+    unsigned int x;
+-+    static unsigned int base = 0;
+-+    clock_gettime(CLOCK_REALTIME, &ts);
+-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+-+    if (base==0) base=x;
+-+    return x-base;
+-+}
+-+#endif
+++#define CFE_ENTS_PER_A 8
+++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
+++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
+++// allow 128
+++#define CFE_ENT_COUNT  128
+++#define CFE_A_COUNT    (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+ +
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+++struct rpi_cache_flush_env_s {
+++    unsigned int n;
+++    struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
+++};
+ +
+-+// Connect to QPU, returns 0 on success.
+-+static int gpu_init(volatile struct GPU **gpu) {
+-+  int mb = mbox_open();
+-+  int vc;
+-+  volatile struct GPU* ptr;
+-+	if (mb < 0)
+-+		return -1;
+-+#ifndef RPI_ASYNC
+-+	if (qpu_enable(mb, 1)) return -2;
+-+#endif
+-+  vcsm_init();
+-+  vc_gpuserv_init();
+-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-+  memset((void*)ptr, 0, sizeof *ptr);
+-+  vc = gpu_mem_ptr.vc;
+++#define WAIT_COUNT_MAX 16
+ +
+-+  ptr->mb = mb;
+-+  ptr->vc = vc;
+++typedef struct trace_time_one_s
+++{
+++  int count;
+++  int64_t start[WAIT_COUNT_MAX];
+++  int64_t total[WAIT_COUNT_MAX];
+++} trace_time_one_t;
+ +
+-+  printf("GPU allocated at 0x%x\n",vc);
+++typedef struct trace_time_wait_s
+++{
+++  unsigned int jcount;
+++  int64_t start0;
+++  int64_t last_update;
+++  trace_time_one_t active;
+++  trace_time_one_t wait;
+++} trace_time_wait_t;
+++
+++typedef struct vq_wait_s
+++{
+++  sem_t sem;
+++  unsigned int cost;
+++  struct vq_wait_s * next;
+++} vq_wait_t;
+ +
+-+  *gpu = ptr;
+++#define VQ_WAIT_POOL_SIZE 16
+++typedef struct vq_wait_pool_s
+++{
+++  vq_wait_t * head;
+++  vq_wait_t pool[VQ_WAIT_POOL_SIZE];
+++} vq_wait_pool_t;
+ +
+-+  // Now copy over the QPU code into GPU memory
+-+  {
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+-+    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-+  }
+-+  // And the VPU code
+-+  {
+-+    int num_bytes = sizeof(rpi_hevc_transform);
+-+    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-+  }
+-+  // And the transform coefficients
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
+++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+ +
+-+#ifdef RPI_ASYNC
+-+  {
+-+    int err;
+-+    vpu_async_tail = 0;
+-+    vpu_async_head = 0;
+-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-+    //printf("Created thread\n");
+-+    if (err) {
+-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+-+        return -4;
+-+    }
+++typedef struct gpu_env_s
+++{
+++  int open_count;
+++  int init_count;
+++  int mb;
+++  unsigned int current_load;
+++  GPU_MEM_PTR_T code_gm_ptr;
+++  vq_wait_pool_t wait_pool;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  trace_time_wait_t ttw;
+++#endif
+++} gpu_env_t;
+ +
+-+    {
+-+      struct sched_param param = {0};
+-+      int policy = 0;
+++// Stop more than one thread trying to allocate memory or use the processing resources at once
+++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+++static gpu_env_t * gpu = NULL;
+ +
+-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+      {
+-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+      }
+-+      else
+-+      {
+-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ +
+-+        policy = SCHED_FIFO;
+-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+++static int64_t ns_time(void)
+++{
+++    struct timespec ts;
+++    clock_gettime(CLOCK_MONOTONIC, &ts);
+++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
+++}
+ +
+-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+ +
+-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+-+        {
+-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+-+        }
+-+        else
+-+        {
+-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+          {
+-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+          }
+-+          else
+-+          {
+-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+-+                policy,
+-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+                param.sched_priority);
+-+          }
+-+        }
+-+      }
+++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+ +
+-+    }
+++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
+++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
+++#define T_ARG(t) T_SEC(t), T_MS(t)
+++#define T_FMT "%u.%03u"
+ +
+++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
+++{
+++  // Update totals for levels that are still pending
+++  for (int i = 0; i < tto->count; ++i) {
+++    tto->total[i] += now - tto->start[i];
+++    tto->start[i] = now;
+ +  }
+-+#endif
+ +
+-+  return 0;
+++  printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
+++         prefix,
+++         T_ARG(now - start0 - tto->total[0]),
+++         T_ARG(tto->total[0]),
+++         T_ARG(tto->total[1]),
+++         T_ARG(tto->total[2]),
+++         T_ARG(tto->total[3]));
+ +}
+ +
+-+// Returns 1 if the gpu is currently idle
+-+static int gpu_idle(void)
+++
+++static void tto_start(trace_time_one_t * const tto, const int64_t now)
+ +{
+-+  int ret = pthread_mutex_trylock(&gpu_mutex);
+-+  if (ret==0) {
+-+    pthread_mutex_unlock(&gpu_mutex);
+-+    return 1;
+-+  }
+-+  return 0;
+++  av_assert0(tto->count < WAIT_COUNT_MAX);
+++  tto->start[tto->count++] = now;
+ +}
+ +
+-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+-+static void gpu_lock(void) {
+-+  pthread_mutex_lock(&gpu_mutex);
+++static void tto_end(trace_time_one_t * const tto, const int64_t now)
+++{
+++  const int n = --tto->count;
+++  av_assert0(n >= 0);
+++  tto->total[n] += now - tto->start[n];
+++}
+ +
+-+  if (gpu==NULL) {
+-+    gpu_init(&gpu);
+-+  }
+++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
+++{
+++  printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
+++  tto_print(&ttw->active, now, ttw->start0, "Active");
+++  tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
+ +}
+ +
+-+static void gpu_unlock(void) {
+-+  pthread_mutex_unlock(&gpu_mutex);
+++#endif
+++
+++// GPU memory alloc fns (internal)
+++
+++// GPU_MEM_PTR_T alloc fns
+++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle);
+++  av_assert0(p->arm);
+++  p->vc = mbox_mem_lock(mb, p->vc_handle);
+++  av_assert0(p->vc);
+++  return 0;
+ +}
+ +
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ +  p->numbytes = numbytes;
+ +  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ +  av_assert0(p->vcsm_handle);
+@@ -11818,90 +14889,143 @@ index 0000000..365f4a6
+ +  av_assert0(p->vc_handle);
+ +  p->arm = vcsm_lock(p->vcsm_handle);
+ +  av_assert0(p->arm);
+-+  p->vc = mem_lock(mb, p->vc_handle);
+++  p->vc = mbox_mem_lock(mb, p->vc_handle);
+ +  av_assert0(p->vc);
+ +  return 0;
+ +}
+ +
+-+// Allocate memory on GPU
+-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+-+// Returns 0 on success.
+-+// This allocates memory that will not be cached in ARM's data cache.
+-+// Therefore safe to use without data cache flushing.
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-+  gpu_lock();
+-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return r;
+++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
+++  mbox_mem_unlock(mb, p->vc_handle);
+++  vcsm_unlock_ptr(p->arm);
+++  vcsm_free(p->vcsm_handle);
+++  memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
+ +}
+ +
+-+int gpu_get_mailbox(void)
+++
+++// GPU init, free, lock, unlock
+++
+++static void gpu_term(void)
+ +{
+-+  av_assert0(gpu);
+-+  return gpu->mb;
+++  gpu_env_t * const ge = gpu;
+++
+++  // We have to hope that everything has terminated...
+++  gpu = NULL;
+++
+++  vc_gpuserv_deinit();
+++
+++  gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+++
+++  vcsm_exit();
+++
+++  mbox_close(ge->mb);
+++
+++  vq_wait_pool_deinit(&ge->wait_pool);
+++
+++  free(ge);
+ +}
+ +
+-+// Call this to clean and invalidate a region of memory
+-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+++
+++// Connect to QPU, returns 0 on success.
+++static int gpu_init(gpu_env_t ** const gpu) {
+++  volatile struct GPU* ptr;
+++  gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
+++  *gpu = NULL;
+++
+++  if (ge == NULL)
+++    return -1;
+++
+++  if ((ge->mb = mbox_open()) < 0)
+++    return -1;
+++
+++  vq_wait_pool_init(&ge->wait_pool);
+++
+++  vcsm_init();
+++
+++  gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
+++  ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+++
+++  // Zero everything so we have zeros between the code bits
+++  memset((void *)ptr, 0, sizeof(*ptr));
+++
+++  // Now copy over the QPU code into GPU memory
+++  {
+++    int num_bytes = (char *)mc_end - (char *)rpi_shader;
+++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+++  }
+++  // And the VPU code
+++  {
+++    int num_bytes = sizeof(rpi_hevc_transform);
+++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+++  }
+++  // And the transform coefficients
+++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++
+++  *gpu = ge;
+++  return 0;
+++}
+++
+++
+++
+++static void gpu_unlock(void) {
+++  pthread_mutex_unlock(&gpu_mutex);
+++}
+++
+++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+++static gpu_env_t * gpu_lock(void) {
+++  pthread_mutex_lock(&gpu_mutex);
+++
+++  av_assert0(gpu != NULL);
+++  return gpu;
+++}
+++
+++static gpu_env_t * gpu_lock_ref(void)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p->arm;
+-+    iocache.s[0].size  = p->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp = vcsm_lock(p->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+++  pthread_mutex_lock(&gpu_mutex);
+++
+++  if (gpu == NULL) {
+++    int rv = gpu_init(&gpu);
+++    if (rv != 0) {
+++      gpu_unlock();
+++      return NULL;
+++    }
+++  }
+++
+++  ++gpu->open_count;
+++  return gpu;
+ +}
+ +
+-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+++static void gpu_unlock_unref(gpu_env_t * const ge)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p0->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p0->arm;
+-+    iocache.s[0].size  = p0->numbytes;
+-+    iocache.s[1].handle = p1->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int) p1->arm;
+-+    iocache.s[1].size  = p1->numbytes;
+-+    iocache.s[2].handle = p2->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int) p2->arm;
+-+    iocache.s[2].size  = p2->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp;
+-+    tmp = vcsm_lock(p0->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p1->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p2->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+++  if (--ge->open_count == 0)
+++    gpu_term();
+++
+++  gpu_unlock();
+++}
+++
+++static inline gpu_env_t * gpu_ptr(void)
+++{
+++  av_assert0(gpu != NULL);
+++  return gpu;
+ +}
+ +
+-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->numbytes = numbytes;
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+-+  av_assert0(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  av_assert0(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  av_assert0(p->arm);
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  av_assert0(p->vc);
+-+  return 0;
+++// Public gpu fns
+++
+++// Allocate memory on GPU
+++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+++// Returns 0 on success.
+++// This allocates memory that will not be cached in ARM's data cache.
+++// Therefore safe to use without data cache flushing.
+++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+++  r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
+++  gpu_unlock();
+++  return r;
+ +}
+ +
+ +// This allocates data that will be
+@@ -11910,653 +15034,518 @@ index 0000000..365f4a6
+ +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+ +{
+ +  int r;
+-+  gpu_lock();
+-+  r = gpu_malloc_cached_internal(numbytes, p);
+-+  gpu->open_count++;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+++  r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
+ +  gpu_unlock();
+ +  return r;
+ +}
+ +
+-+static void gpu_term(void)
+-+{
+-+  int mb;
+-+
+-+  if (gpu==NULL)
+-+    return;
+-+  mb = gpu->mb;
+-+
+-+  // ??? Tear down anything needed for gpuexecute
+-+
+-+  qpu_enable(mb, 0);
+-+  gpu_free_internal(&gpu_mem_ptr);
+-+
+-+  vc_gpuserv_deinit();
+-+  vcsm_exit();
+-+
+-+  mbox_close(mb);
+-+  gpu = NULL;
+++void gpu_free(GPU_MEM_PTR_T * const p) {
+++  gpu_env_t * const ge = gpu_lock();
+++  gpu_free_internal(ge->mb, p);
+++  gpu_unlock_unref(ge);
+ +}
+ +
+-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
+-+  int mb = gpu->mb;
+-+  mem_unlock(mb,p->vc_handle);
+-+  vcsm_unlock_ptr(p->arm);
+-+  vcsm_free(p->vcsm_handle);
+++unsigned int vpu_get_fn(void) {
+++  // Make sure that the gpu is initialized
+++  av_assert0(gpu != NULL);
+++  return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code);
+ +}
+ +
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+
+-+  gpu_free_internal(p);
+-+
+-+  gpu->open_count--;
+-+  if (gpu->open_count==0) {
+-+      printf("Closing GPU\n");
+-+      gpu_term();
+-+      gpu = NULL;
+-+  }
+-+  gpu_unlock();
+++unsigned int vpu_get_constants(void) {
+++  av_assert0(gpu != NULL);
+++  return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
+ +}
+ +
+-+unsigned int vpu_get_fn(void) {
+-+  // Make sure that the gpu is initialized
+-+  if (gpu==NULL) {
+-+    printf("Preparing gpu\n");
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,vpu_code);
+++int gpu_get_mailbox(void)
+++{
+++  av_assert0(gpu);
+++  return gpu->mb;
+ +}
+ +
+-+unsigned int vpu_get_constants(void) {
+-+  if (gpu==NULL) {
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
+++void gpu_ref(void)
+++{
+++  gpu_lock_ref();
+++  gpu_unlock();
+ +}
+ +
+-+#ifdef GPUSERVICE
+-+static void callback(void *cookie)
+++void gpu_unref(void)
+ +{
+-+  sem_post((sem_t *)cookie);
+++  gpu_env_t * const ge = gpu_lock();
+++  gpu_unlock_unref(ge);
+ +}
+-+#endif
+ +
+++// ----------------------------------------------------------------------------
+++//
+++// Cache flush functions
+ +
+-+static volatile uint32_t post_done = 0;
+-+static volatile uint32_t post_qed = 0;
+ +
+-+static void post_code2_cb(void * v)
+++rpi_cache_flush_env_t * rpi_cache_flush_init()
+ +{
+-+  uint32_t n = (uint32_t)v;
+-+  if ((int32_t)(n - post_done) > 0) {
+-+    post_done = n;
+-+  }
+++    rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t));
+++    if (rfe == NULL)
+++        return NULL;
+++
+++    rfe->n = 0;
+++    return rfe;
+ +}
+ +
+++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+++{
+++    if (rfe != NULL)
+++        free(rfe);
+++}
+ +
+-+// Post a command to the queue
+-+// Returns an id which we can use to wait for completion
+-+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
+ +{
+-+  struct gpu_job_s j[1] = {
+++    int rc = 0;
+++    unsigned int na;
+++    unsigned int nr;
+++
+++    // Clear any remaining ents in the final block
+++    if ((nr = rfe->n % CFE_ENTS_PER_A) != 0)
+++        memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0]));
+++
+++    for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na)
+ +    {
+-+      .command = EXECUTE_VPU,
+-+      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+-+      .callback.func = post_code2_cb
+++        if (vcsm_clean_invalid(rfe->a + na) != 0)
+++            rc = -1;
+ +    }
+-+  };
+-+  uint32_t id;
+ +
+-+  j[0].callback.cookie = (void *)(id = ++post_qed);
+++    free(rfe);
+ +
+-+  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++    if (rc == 0)
+++        return 0;
+ +
+-+  return id;
+++    av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno);
+++    return rc;
+ +}
+ +
+-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+    int qpu0_n, const uint32_t * qpu0_mail,
+-+    int qpu1_n, const uint32_t * qpu1_mail)
+++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+ +{
+-+#if 1
+-+  sem_t sync0;
+-+  struct gpu_job_s j[4];
+-+
+-+  sem_init(&sync0, 0, 0);
+-+
+-+  j[0].command = EXECUTE_VPU;
+-+  j[0].u.v.q[0] = vpu_code;
+-+  j[0].u.v.q[1] = r0;
+-+  j[0].u.v.q[2] = r1;
+-+  j[0].u.v.q[3] = r2;
+-+  j[0].u.v.q[4] = r3;
+-+  j[0].u.v.q[5] = r4;
+-+  j[0].u.v.q[6] = r5;
+-+  j[0].callback.func = 0;
+-+  j[0].callback.cookie = NULL;
+-+
+-+  j[1].command = EXECUTE_QPU;
+-+  j[1].u.q.jobs = qpu1_n;
+-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+-+  j[1].u.q.timeout = 5000;
+-+  j[1].callback.func = 0;
+-+  j[1].callback.cookie = NULL;
+-+
+-+  j[2].command = EXECUTE_QPU;
+-+  j[2].u.q.jobs = qpu0_n;
+-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[2].u.q.noflush = 1;
+-+  j[2].u.q.timeout = 5000;
+-+  j[2].callback.func = 0;
+-+  j[2].callback.cookie = NULL;
+-+
+-+  j[3].command = EXECUTE_SYNC;
+-+  j[3].u.s.mask = 3;
+-+  j[3].callback.func = callback;
+-+  j[3].callback.cookie = (void *)&sync0;
+-+
+-+  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+-+
+-+  sem_wait(&sync0);
+-+#else
+++    // Deal with empty pointer trivially
+++    if (gm == NULL || gm->numbytes == 0)
+++        return;
+ +
+-+  sem_t sync0, sync2;
+-+  struct gpu_job_s j[3];
+-+
+-+  sem_init(&sync0, 0, 0);
+-+  sem_init(&sync2, 0, 0);
+-+
+-+  j[0].command = EXECUTE_VPU;
+-+  j[0].u.v.q[0] = vpu_code;
+-+  j[0].u.v.q[1] = r0;
+-+  j[0].u.v.q[2] = r1;
+-+  j[0].u.v.q[3] = r2;
+-+  j[0].u.v.q[4] = r3;
+-+  j[0].u.v.q[5] = r4;
+-+  j[0].u.v.q[6] = r5;
+-+  j[0].callback.func = callback;
+-+  j[0].callback.cookie = (void *)&sync0;
+-+
+-+  j[1].command = EXECUTE_QPU;
+-+  j[1].u.q.jobs = qpu1_n;
+-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+-+  j[1].u.q.timeout = 5000;
+-+  j[1].callback.func = 0;
+-+  j[1].callback.cookie = NULL;
+-+
+-+  j[2].command = EXECUTE_QPU;
+-+  j[2].u.q.jobs = qpu0_n;
+-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[2].u.q.noflush = 1;
+-+  j[2].u.q.timeout = 5000;
+-+  j[2].callback.func = callback;
+-+  j[2].callback.cookie = (void *)&sync2;
+-+
+-+  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+-+
+-+  sem_wait(&sync0);
+-+  sem_wait(&sync2);
+-+#endif
+++    {
+++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
+++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
+ +
+-+  return 0;
+-+}
+++        av_assert0(rfe->n < CFE_ENT_COUNT);
+ +
+++        a->s[n].cmd = mode;
+++        a->s[n].handle = gm->vcsm_handle;
+++        a->s[n].addr = (unsigned int)gm->arm;
+++        a->s[n].size = gm->numbytes;
+++        ++rfe->n;
+++    }
+++}
+ +
+-+// Wait for completion of the given command
+-+void vpu_wait(int id)
+++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+++  const unsigned int offset, const unsigned int size)
+ +{
+-+  if (id == 0) {
+-+#if 0
+-+    sem_t sync0;
+-+    struct gpu_job_s j[1] =
+++    // Deal with empty pointer trivially
+++    if (gm == NULL || size == 0)
+++        return;
+++
+++//    printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes);
+++
+++    av_assert0(offset <= gm->numbytes);
+++    av_assert0(size <= gm->numbytes);
+++    av_assert0(offset + size <= gm->numbytes);
+++
+ +    {
+-+      {
+-+        .command = EXECUTE_SYNC,
+-+        .u.s.mask = 3,
+-+        .callback.func = callback,
+-+        .callback.cookie = (void *)&sync0
+-+      }
+-+    };
+++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
+++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
+ +
+-+    sem_init(&sync0, 0, 0);
+++        av_assert0(rfe->n < CFE_ENT_COUNT);
+ +
+-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++        a->s[n].cmd = mode;
+++        a->s[n].handle = gm->vcsm_handle;
+++        a->s[n].addr = (unsigned int)gm->arm + offset;
+++        a->s[n].size = size;
+++        ++rfe->n;
+++    }
+++}
+ +
+-+    sem_wait(&sync0);
+++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+++{
+++#if !RPI_ONE_BUF
+++#error Fixme! (NIF)
+ +#endif
+++  if (gpu_is_buf1(frame)) {
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+ +  }
+-+  else {
+-+    while ((int32_t)(post_done - (uint32_t)id) < 0) {
+-+      usleep(1000);
+-+    }
+++  else
+++  {
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+ +  }
+ +}
+ +
+-+
+-+unsigned int qpu_get_fn(int num) {
+-+    // Make sure that the gpu is initialized
+-+    unsigned int *fn;
+-+    if (gpu==NULL) {
+-+      printf("Preparing gpu\n");
+-+      gpu_lock();
+-+      gpu_unlock();
+++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma)
+++{
+++  const unsigned int y_offset = frame->linesize[0] * start_line;
+++  const unsigned int y_size = frame->linesize[0] * n;
+++  // Round UV up/down to get everything
+++  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+++  const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift);
+++  const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset;
+++
+++  // As all unsigned they will also reject -ve
+++  // Test individually as well as added to reject overflow
+++  av_assert0(start_line <= (unsigned int)frame->height);
+++  av_assert0(n <= (unsigned int)frame->height);
+++  av_assert0(start_line + n <= (unsigned int)frame->height);
+++
+++  if (!gpu_is_buf1(frame))
+++  {
+++    if (do_luma) {
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+ +    }
+-+    switch(num) {
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-+    case QPU_MC_EXIT:
+-+      fn = mc_exit;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT12:
+-+      fn = mc_interrupt_exit12;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-+    //case QPU_MC_FILTER_HONLY:
+-+    //  fn = mc_filter_honly;
+-+    //  break;
+-+    case QPU_MC_SETUP_UV:
+-+      fn = mc_setup_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV:
+-+      fn = mc_filter_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B0:
+-+      fn = mc_filter_uv_b0;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B:
+-+      fn = mc_filter_uv_b;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT8:
+-+      fn = mc_interrupt_exit8;
+-+      break;
+-+    case QPU_MC_END:
+-+      fn = mc_end;
+-+      break;
+-+    default:
+-+      printf("Unknown function\n");
+-+      exit(-1);
+++    if (do_chroma) {
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+++    }
+++  }
+++  else if (!rpi_sliced_frame(frame))
+++  {
+++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+++    if (do_luma) {
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+++    }
+++    if (do_chroma) {
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+++    }
+++  }
+++  else
+++  {
+++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+++//    printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' ');
+++    for (int x = 0; x < frame->width; x += frame->linesize[0]) {
+++      if (do_luma) {
+++        rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size);
+++      }
+++      if (do_chroma) {
+++        rpi_cache_flush_add_gm_range(rfe, gm, mode,
+++                                     (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size);
+++      }
+ +    }
+-+    return gpu->vc + 4*(int)(fn-rpi_shader);
+-+    //return code[num] + gpu->vc;
+++  }
+ +}
+ +
+-+#if 0
+-+typedef unsigned int uint32_t;
+-+
+-+typedef struct mvs_s {
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+++// Call this to clean and invalidate a region of memory
+++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+++{
+++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+++  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+++  rpi_cache_flush_finish(rfe);
+++}
+ +
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+ +
+-+} HEVCContext;
+++// ----------------------------------------------------------------------------
+ +
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+ +
+-+static void rpi_inter_clear(HEVCContext *s)
+++// Wait abstractions - mostly so we can easily add profile code
+++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+ +{
+-+    int i;
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 128;  // w
+-+        *s->u_mvs[i]++ = 128;  // h
+-+        *s->u_mvs[i]++ = 128;  // stride u
+-+        *s->u_mvs[i]++ = 128;  // stride v
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+++  unsigned int i;
+++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+++    sem_init(&wp->pool[i].sem, 0, 0);
+++    wp->pool[i].next = wp->pool + i + 1;
+++  }
+++  wp->head = wp->pool + 0;
+++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+ +}
+ +
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+ +{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+++  unsigned int i;
+++  wp->head = NULL;
+++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+++    sem_destroy(&wp->pool[i].sem);
+++    wp->pool[i].next = NULL;
+++  }
+++}
+ +
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+-+    }
+ +
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+++// If sem_init actually takes time then maybe we want a pool...
+++static vq_wait_t * vq_wait_new(const unsigned int cost)
+++{
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  vq_wait_t * const wait = ge->wait_pool.head;
+++  ge->wait_pool.head = wait->next;
+++  ge->current_load += cost;
+++  wait->cost = cost;
+++  wait->next = NULL;
+++
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  tto_start(&ge->ttw.active, ns_time());
+++#endif
+++
+++  gpu_unlock();
+++  return wait;
+ +}
+ +
+-+void rpi_test_qpu(void)
+++static void vq_wait_delete(vq_wait_t * const wait)
+ +{
+-+    HEVCContext mvs;
+-+    HEVCContext *s = &mvs;
+-+    int i;
+-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+    uint32_t *p;
+-+    printf("Allocate memory\n");
+-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+-+
+-+    // Set up initial locations for uniform streams
+-+    p = s->unif_mvs;
+-+    for(i = 0; i < 8; i++) {
+-+        s->mvs_base[i] = p;
+-+        p += uv_commands_per_qpu;
+++  gpu_env_t * const ge = gpu_lock();
+++  wait->next = ge->wait_pool.head;
+++  ge->wait_pool.head = wait;
+++
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  {
+++    trace_time_wait_t * const ttw = &ge->ttw;
+++    const int64_t now = ns_time();
+++    ++ttw->jcount;
+++    tto_end(&ttw->wait, now);
+++
+++    if (ttw->start0 == 0)
+++    {
+++      ttw->start0 = ttw->active.start[0];
+++      ttw->last_update = ttw->start0;
+ +    }
+-+    // Now run a simple program that should just quit immediately after a single texture fetch
+-+    rpi_inter_clear(s);
+-+    for(i=0;i<4;i++) {
+-+      printf("Launch QPUs\n");
+-+      rpi_execute_inter_qpu(s);
+-+      printf("Done\n");
+++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
+++    {
+++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
+++      ttw_print(ttw, now);
+ +    }
+-+    printf("Free memory\n");
+-+    gpu_free(&s->unif_mvs_ptr);
+-+    return;
+-+}
+++  }
+ +#endif
+++  gpu_unlock_unref(ge);
+++}
+ +
+-+#if 0
+-+
+-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
+-+
+-+static uint8_t av_clip_uint8(int32_t a)
+++static void vq_wait_wait(vq_wait_t * const wait)
+ +{
+-+    if (a&(~255)) return (-a)>>31;
+-+    else          return a;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  {
+++      const int64_t now = ns_time();
+++      gpu_env_t * const ge = gpu_lock();
+++      tto_start(&ge->ttw.wait, now);
+++      gpu_unlock();
+++  }
+++#endif
+++
+++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
+++    /* loop */;
+ +}
+ +
+-+static int32_t filter8(const uint8_t *data, int pitch)
+++static void vq_wait_post(vq_wait_t * const wait)
+ +{
+-+   int32_t vsum = 0;
+-+   int x, y;
+++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
+++  if (wait->cost != 0)
+++#endif
+++  {
+++    gpu_env_t *const ge = gpu_lock();
+++    ge->current_load -= wait->cost;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++    tto_end(&ge->ttw.active, ns_time());
+++#endif
+++    gpu_unlock();
+++  }
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++  sem_post(&wait->sem);
+++}
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += hcoeffs[x]*data[x + y * pitch];
+ +
+-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+-+   }
+ +
+-+   return av_clip_uint8( (vsum + 64) >> 7);
+-+}
+++// Header comments were wrong for these two
+++#define VPU_QPU_MASK_QPU  1
+++#define VPU_QPU_MASK_VPU  2
+ +
+-+// Note regression changes coefficients so is not thread safe
+-+//#define REGRESSION
+-+#ifdef REGRESSION
+-+#define CMAX 100
+-+#else
+-+#define CMAX 2
+-+#endif
+-+#define YMAX 16
+++#define VPU_QPU_JOB_MAX 4
+++struct vpu_qpu_job_env_s
+++{
+++  unsigned int n;
+++  unsigned int mask;
+++  unsigned int cost;
+++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
+++};
+ +
+-+int rpi_test_shader(void)
+++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+++
+++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
+ +{
+-+   int i, c;
+++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
+++  return vqj;
+++}
+ +
+-+   uint32_t *unifs;
+++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+++{
+++  memset(vqj, 0, sizeof(*vqj));
+++  free(vqj);
+++}
+ +
+-+   uint8_t *in_buffer;
+-+   uint8_t *out_buffer[2];
+++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
+++{
+++  struct gpu_job_s * const j = vqj->j + vqj->n++;
+++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
+++  return j;
+++}
+ +
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   GPU_MEM_PTR_T in_buffer_ptr;
+-+   GPU_MEM_PTR_T out_buffer_ptr[2];
+++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
+++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
+++{
+++  if (vpu_code != 0) {
+++    struct gpu_job_s *const j = new_job(vqj);
+++    vqj->mask |= VPU_QPU_MASK_VPU;
+++
+++    j->command = EXECUTE_VPU;
+++    j->u.v.q[0] = vpu_code;
+++    j->u.v.q[1] = r0;
+++    j->u.v.q[2] = r1;
+++    j->u.v.q[3] = r2;
+++    j->u.v.q[4] = r3;
+++    j->u.v.q[5] = r4;
+++    j->u.v.q[6] = r5;
+++  }
+++}
+ +
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+++// flags are QPU_FLAGS_xxx
+++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
+++{
+++  if (n != 0) {
+++    struct gpu_job_s *const j = new_job(vqj);
+++    vqj->mask |= VPU_QPU_MASK_QPU;
+++    vqj->cost += cost;
+++
+++    j->command = EXECUTE_QPU;
+++    j->u.q.jobs = n;
+++#if RPI_TRACE_QPU_PROFILE_ALL
+++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+++#else
+++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+++#endif
+++    j->u.q.timeout = 5000;
+++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  }
+++}
+ +
+-+   int pitch = 0x500;
+++// Convert callback to sem post
+++static void vpu_qpu_job_callback_wait(void * v)
+++{
+++  vq_wait_post(v);
+++}
+ +
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+++{
+++  vq_wait_t * wait;
+ +
+-+   printf("This needs to change to reflect new assembler\n");
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+++  if (vqj->mask == 0) {
+++    *wait_h = NULL;
+++    return;
+++  }
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return -2;
+-+   }
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+++  // We are going to want a sync object
+++  wait = vq_wait_new(vqj->cost);
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+-+      return -3;
+-+   }
+-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++  // There are 2 VPU Qs & 1 QPU Q, so we can collapse the sync onto the
+++  // last job if we only posted one thing or only QPU jobs
+++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+++  {
+++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+++    av_assert0(j->callback.func == 0);
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+-+      return -4;
+-+   }
+-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+-+
+-+   for (c = 0; c < CMAX; c++) {
+-+      int xo[] = {rand()&31, rand()&31};
+-+
+-+#ifdef REGRESSION
+-+      for (i = 0; i < 8; i++) {
+-+         hcoeffs[i] = (int8_t)rand();
+-+         vcoeffs[i] = (int8_t)rand();
+-+         if (hcoeffs[i]==-128)
+-+           hcoeffs[i]++;
+-+         if (vcoeffs[i]==-128)
+-+           vcoeffs[i]++;
+-+      }
+-+#endif
+++    j->callback.func = vpu_qpu_job_callback_wait;
+++    j->callback.cookie = wait;
+++  }
+++  else
+++  {
+++    struct gpu_job_s *const j = new_job(vqj);
+ +
+-+      for (i = 0; i < 64*23; i++) {
+-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+-+         in_buffer[i] = rand();
+-+      }
+++    j->command = EXECUTE_SYNC;
+++    j->u.s.mask = vqj->mask;
+++    j->callback.func = vpu_qpu_job_callback_wait;
+++    j->callback.cookie = wait;
+++  }
+ +
+-+      // Clear output array
+-+      {
+-+        int b;
+-+        for(b=0;b<2;b++) {
+-+          for(i=0;i<16*16;i++) {
+-+            out_buffer[b][i] = 3;
+-+          }
+-+        }
+-+      }
+++  vqj->cost = 0;
+++  vqj->mask = 0;
+++  *wait_h = wait;
+++}
+ +
+-+      unifs[0] = mc_filter;
+-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+-+      unifs[2] = 64; // src pitch
+-+      unifs[3] = pitch; // dst pitch
+-+      unifs[4] = 0; // Padding
+-+      unifs[5] = 0;
+-+      unifs[6] = 0;
+-+      unifs[7 ] = mc_filter;
+-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[13] = out_buffer_ptr[0].vc;
+-+      unifs[14] = mc_exit;
+-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[20] = out_buffer_ptr[1].vc;
+-+
+-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+-+
+-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+-+
+-+      if (1)
+-+      {
+-+         int x, y, b;
+-+         int bad = 0;
+-+
+-+         for (b=0; b<2; ++b)
+-+            for (y=0; y<YMAX; ++y)
+-+               for (x=0; x<16; ++x) {
+-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+-+
+-+                  if (out_buffer[b][x+y*pitch] != ref) {
+-+                      bad = 1;
+-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
+-+                  }
+-+#ifndef REGRESSION
+-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+-+#endif
+-+               }
+-+          if (bad)
+-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+          else
+-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+      }
+-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
+-+   }
+++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+++{
+++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
+++}
+ +
+-+   gpu_free(&out_buffer_ptr[0]);
+-+   gpu_free(&out_buffer_ptr[1]);
+-+   gpu_free(&in_buffer_ptr);
+-+   gpu_free(&unifs_ptr);
+++// Simple wrapper of start + delete
+++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
+++{
+++  int rv;
+++  rv = vpu_qpu_job_start(vqj);
+++  vpu_qpu_job_delete(vqj);
+++  return rv;
+++}
+ +
+-+   return 0;
+++unsigned int vpu_qpu_current_load(void)
+++{
+++  return gpu_ptr()->current_load;
+ +}
+ +
+-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+ +{
+-+  int x,y;
+-+  for (y=0; y<16; ++y) {
+-+    for (x=0; x<16; ++x) {
+-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+++  if (wait_h != NULL)
+++  {
+++    vq_wait_t * const wait = *wait_h;
+++    if (wait != NULL) {
+++      *wait_h = NULL;
+++      vq_wait_wait(wait);
+++      vq_wait_delete(wait);
+ +    }
+ +  }
+ +}
+ +
+-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+++int vpu_qpu_init()
+ +{
+-+   uint32_t *unifs;
+-+
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   //uint8_t *out_buffer;
+-+   //GPU_MEM_PTR_T out_buffer_ptr;
+-+
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+   //int x,y;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+ +
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+++  if (ge->init_count++ == 0)
+++  {
+++    vc_gpuserv_init();
+++  }
+ +
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+++  gpu_unlock();
+++  return 0;
+++}
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return;
+-+   }
+-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+++void vpu_qpu_term()
+++{
+++  gpu_env_t * const ge = gpu_lock();
+ +
+-+   /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         out_buffer[x+y*dst_pitch] = 7;
+-+      }
+-+    }*/
+++  if (--ge->init_count == 0) {
+++    vc_gpuserv_deinit();
+ +
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+-+
+-+    unifs[0] = mc_filter;
+-+    unifs[1] = (int)in_buffer_vc;
+-+    unifs[2] = src_pitch; // src pitch
+-+    unifs[3] = dst_pitch; // dst pitch
+-+    unifs[4] = 0; // Padding
+-+    unifs[5] = 0;
+-+    unifs[6] = 0;
+-+    unifs[7 ] = mc_exit;
+-+    unifs[8 ] = (int)in_buffer_vc;
+-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+    unifs[13] = (int)dst_vc;
+-+    //unifs[13] = (int)out_buffer_ptr.vc;
+-+
+-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+
+-+    /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+-+      }
+-+    }*/
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++    ttw_print(&ge->ttw, ns_time());
+++#endif
+++  }
+ +
+-+    gpu_free(&unifs_ptr);
+-+    //gpu_free(&out_buffer_ptr);
+++  gpu_unlock_unref(ge);
+ +}
+ +
+-+
+-+
+-+#endif
+++uint32_t qpu_fn(const int * const mc_fn)
+++{
+++  return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+++}
+ +
+ +#endif // RPI
+ diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+ new file mode 100644
+-index 0000000..c6cdb2b
++index 0000000..a95f7d9
+ --- /dev/null
+ +++ b/libavcodec/rpi_qpu.h
+-@@ -0,0 +1,176 @@
++@@ -0,0 +1,200 @@
+ +#ifndef RPI_QPU_H
+ +#define RPI_QPU_H
+ +
+-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+-+#define RPI_FAST_CACHEFLUSH
+-+
+ +#define RPI_ONE_BUF 1
+ +
+ +typedef struct gpu_mem_ptr_s {
+@@ -12570,9 +15559,7 @@ index 0000000..c6cdb2b
+ +// General GPU functions
+ +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+ +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern void gpu_free(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+++extern void gpu_free(GPU_MEM_PTR_T * const p);
+ +
+ +#include "libavutil/frame.h"
+ +#if !RPI_ONE_BUF
+@@ -12615,29 +15602,31 @@ index 0000000..c6cdb2b
+ +    return av_buffer_get_opaque(frame->buf[0]);
+ +}
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+ +{
+ +    return av_buffer_pool_opaque(frame->buf[n]);
+ +}
+ +
+++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
+++{
+++    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
+++    return gm->vc + (frame->data[n] - gm->arm);
+++}
+++
+ +
+ +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+++    return get_vc_address3(frame, 0);
+ +}
+ +
+ +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 1)->vc;
+++    return get_vc_address3(frame, 1);
+ +}
+ +
+ +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 2)->vc;
+++    return get_vc_address3(frame, 2);
+ +}
+ +
+-+
+++#if 0
+ +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ +    if (gpu_is_buf1(frame))
+ +    {
+@@ -12674,48 +15663,74 @@ index 0000000..c6cdb2b
+ +    else
+ +        return *gpu_buf3_gmem(frame, 2);
+ +}
+-+
+++#endif
+ +#endif
+ +
+++// Cache flush stuff
+++
+++struct rpi_cache_flush_env_s;
+++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+++
+++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
+++// Free env without flushing
+++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+++// Do the accumulated flush & free the env
+++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+++
+++typedef enum
+++{
+++    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
+++    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
+++    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
+++} rpi_cache_flush_mode_t;
+++
+++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+++  const unsigned int offset, const unsigned int size);
+++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
+++
+++// init, add, finish for one gm ptr
+++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+++
+ +
+ +// QPU specific functions
+-+extern void rpi_test_qpu(void);
+-+
+-+enum {
+-+  QPU_MC_SETUP,
+-+  QPU_MC_FILTER,
+-+  QPU_MC_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT12,
+-+  QPU_MC_FILTER_B,
+-+  QPU_MC_FILTER_HONLY,
+-+  QPU_MC_SETUP_UV,
+-+  QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B0,
+-+  QPU_MC_FILTER_UV_B,
+-+  QPU_MC_INTERRUPT_EXIT8,
+-+  QPU_MC_END
+-+  };
+-+extern unsigned int qpu_get_fn(int num);
+-+
+-+#define QPU_N_UV   8
+-+#define QPU_N_Y    12
+-+#define QPU_N_MAX  16
+++uint32_t qpu_fn(const int * const mc_fn);
+++
+++#define QPU_N_GRP_UV 4
+++#define QPU_N_UV     8
+++#define QPU_N_GRP_Y  4  // 4 QPUs per TMU
+++#define QPU_N_Y      12
+++
+++#define QPU_MAIL_EL_VALS  2
+++
+++struct vpu_qpu_wait_s;
+++typedef struct vq_wait_s * vpu_qpu_wait_h;
+++
+++// VPU specific functions
+++
+++struct vpu_qpu_job_env_s;
+++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+++
+++vpu_qpu_job_h vpu_qpu_job_new(void);
+++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
+++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
+++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
+++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
+++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
+++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
+++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
+ +
+-+#define QPU_MAIL_EL_VALS  2
+-+#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
+-+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+-+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+ +
+-+// VPU specific functions
+ +extern unsigned int vpu_get_fn(void);
+ +extern unsigned int vpu_get_constants(void);
+-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+    int qpu0_n, const uint32_t * qpu0_mail,
+-+    int qpu1_n, const uint32_t * qpu1_mail);
+ +
+-+extern void vpu_wait( int id);
+++// Waits for the job associated with *wait_h to complete and nulls out *wait_h (no-op if NULL)
+++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
+++unsigned int vpu_qpu_current_load(void);
+++int vpu_qpu_init(void);
+++void vpu_qpu_term(void);
+ +
+ +// Simple test of shader code
+ +extern int rpi_test_shader(void);
+@@ -12724,14 +15739,16 @@ index 0000000..c6cdb2b
+ +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+ +
+ +extern int gpu_get_mailbox(void);
+++void gpu_ref(void);
+++void gpu_unref(void);
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+ new file mode 100644
+-index 0000000..06fb166
++index 0000000..0898ecd
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.c
+-@@ -0,0 +1,629 @@
++@@ -0,0 +1,670 @@
+ +#include "rpi_shader.h"
+ +
+ +#ifdef _MSC_VER
+@@ -12755,607 +15772,648 @@ index 0000000..06fb166
+ +__attribute__((aligned(8)))
+ +#endif
+ +unsigned int rpi_shader[] = {
+-+// ::mc_setup_uv
+-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+-+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+-+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+-+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+-+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+-+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+-+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+++// ::mc_setup_c
+++/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1          ; mov -, unif
+++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+++/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1
+++/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+++/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
+++/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+++/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+++/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+++/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+++/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif   ; mov ra12, 0
+++/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif    ; mov ra13, 0
+++/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num      ; mov ra14, 0
+++/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0
+++/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b
+++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0         ; mov ra_y, ra0.16a
+++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1
+++/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4
+++/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
+++/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
+++/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
+++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1        ; mov r1, ra_y
+++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+++/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
+++/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
+++/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif
+++/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+++/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5
+++/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+++/* [0x00000160] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
+++/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+++/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+++/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+++/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a
+++/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b
+++/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0         ; mov -, unif
+++/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
+++/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1         ; mov -, unif
+++/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4
+++/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
+++/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
+++/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
+++/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1        ; mov r1, ra_y2
+++/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+++/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif
+++/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0         ; mov -, unif
+++/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y  ; mov -, unif
+++/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0
+ +// ::mc_filter_uv
+-+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
+-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+-+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
+-+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+-+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
+-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+-+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
+-+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
+-+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
+-+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
+-+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+-+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+-+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+-+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+++/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif     ; mov vw_setup, rb28
+++/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
+++/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
+++/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
+++/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
+++/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
+++/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+++/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
+++/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1       ; mov ra1, unif
+++/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3       ; mov.ifnz ra1, unif
+++/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0,   r0, r2      ; mov rb8,  ra3.8a
+++/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0,   r0, 15      ; mov rb9,  ra3.8b
+++/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27    ; mov r1, ra1.16b
+++/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13      ; mov rb10, ra3.8c
+++/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+ +// :uvloop
+-+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
+++/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
+++/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
+++/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
+++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
+++/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
+++/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
+++/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
+++/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0        ; mul24 r0, ra14, rb10
+++/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0        ; mul24 r0, ra15, rb11
+++/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256
+++/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop                   ; mul24 r1, r1, rb14
+++/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13
+++/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
+++/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13
+++/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1
+++/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+ +// ::mc_filter_uv_b0
+-+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
+-+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
+-+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
+-+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
+-+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
+-+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
+-+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+-+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+-+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+-+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+-+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
+-+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
+++/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif           ; mov vw_setup, rb28
+++/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
+++/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
+++/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
+++/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
+++/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
+++/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+++/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
+++/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1
+++/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3       ; mov rb8,  ra3.8a
+++/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0,   r0, r2      ; mov rb9,  ra3.8b
+++/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0,   r0, 15      ; mov rb10, ra3.8c
+++/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif
+++/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif
+ +// :uvloop_b0
+-+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+-+// ::mc_filter_uv_b
+-+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+-+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
+-+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
+-+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
+-+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
+-+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
+-+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+-+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+-+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
+-+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
+-+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
+-+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
+-+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+-+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
+++/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
+++/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
+++/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
+++/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
+++/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
+++/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
+++/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15        ; mul24 r2, ra15, rb10
+++/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
+++/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
+++/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
+++/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0        ; mov ra7, rb6
+++/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31
+++/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6    ; mov rb6, ra5
+++/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4          ; mov rb4, ra4
+++/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5          ; mov rb5, ra6
+++/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7          ; mov rb7, ra8
+++/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3
+++/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin
+++/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3        ; mov -, unif
+++/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16     ; mov ra_link, unif
+++/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000
+++/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12
+++/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
+++/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
+++/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
+++/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
+++/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30
+++/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
+++/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
+++/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin
+++/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
+++/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
+++/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
+++// :uv_b0_post12
+++/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6           ; mov r3, rb7
+++/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
+++/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2           ; mov rb4, r3
+++/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2,  ra4          ; mov r3,  rb5
+++/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
+++/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2           ; mov rb6, r3
+++// :uv_b0_post_fin
+++/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next
+++/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
+++/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4
+++/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0        ; mov ra_y2_next, ra2.16a
+++/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1        ; mov ra3, unif
+++/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1        ; mov rb8,  ra3.8a
+++/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+++/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif         ; mov rb9,  ra3.8b
+++/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif    ; mov rb10, ra3.8c
+++/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop                   ; mov rb11, ra3.8d
+++/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3
+++/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+ +// :uvloop_b
+-+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
+-+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+-+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
+-+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+-+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
+-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1
+++/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
+++/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y2
+++/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch
+++/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
+++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
+++/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
+++/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
+++/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
+++/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
+++/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
+++/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
+++/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
+++/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15          ; mul24 r2, ra15, rb10
+++/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
+++/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
+++/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
+++/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6          ; mul24 r1, r1, ra_k256
+++/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14        ; mov rb6, ra5
+++/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4          ; mul24 r1, r1, ra1.16a
+++/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0        ; mov rb4, ra4
+++/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5          ; mul24 r1, r1, ra_k256
+++/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12      ; mov rb5, ra6
+++/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31  ; mov ra6, rb7
+++/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13
+++/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
+++/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13
+++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait        ; mov rb7, ra8
+++/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3
+++/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_interrupt_exit8c
+++/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov  -, vw_wait ; nop ; ldtmu0
+++/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit
+++// ::mc_exit_c
+++/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
+++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+++/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_interrupt_exit12
+++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
+++/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit1
+++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+ +// ::mc_setup
+-+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+-+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+-+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+-+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+-+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+-+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+-+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+-+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+-+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+-+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+-+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+-+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+-+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+-+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+-+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+-+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+-+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+-+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+++/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1          ; mov ra8, unif
+++/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif
+++/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+++/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+++/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+++/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+++/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
+++/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+++/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or  rb24, r1, rb_pitch
+++/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+++/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3
+++/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
+++/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+++/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+++/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b
+++/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1
+++/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1
+++/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3
+++/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+++/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b
+++/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1
+++/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
+++/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8,  0           ; mov rb8,  0
+++/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9,  0           ; mov rb9,  0
+++/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0           ; mov rb10, 0
+++/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0           ; mov rb11, 0
+++/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+++/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+++/* [0x00000be0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
+++/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+++/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0
+++/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1
+++/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif           ; mul24 r1, r1, rb_pitch
+++/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base
+++/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+++/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+++/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2
+ +// :per_block_setup
+-+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
+-+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+-+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
+-+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+-+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+-+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
+-+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+-+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+-+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+-+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+-+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+-+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+-+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+-+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+-+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+-+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+-+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+-+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+-+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+-+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+-+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+-+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+-+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+-+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+-+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+-+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+-+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+-+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+-+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+-+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+-+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+-+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+-+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+-+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+-+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+-+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
+-+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
+-+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
+-+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
+-+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
+-+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
+-+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif         ; mov r3, elem_num
+++/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next
+++/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next
+++/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
+++/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
+++/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+++/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0
+++/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b
+++/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif
+++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
+++/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0
+++/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b
+++/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif
+++/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width
+++/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5  ; mov r0, ra_height
+++/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+++/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0,   r0, 7
+++/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0,   r0, ra_width
+++/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+++/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16          ; mov ra5, unif
+++/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400
+++/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3                     ; mov rb14, ra5.16a
+++/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+++/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+++/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+++/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d
+++/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c
+++/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+++/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+++/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+++/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+++/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+++/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+++/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+++/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+++/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+++/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+++/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+++/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+++/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+++/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+++/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+++/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+++/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d
+++/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c
+++/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a            ; mov ra18, unif
+++/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b
+++/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c
+++/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18
+++/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif
+++/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13
+++/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9
+++/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                  ; mov rb7, ra3.8d
+ +// ::mc_filter
+-+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+++/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1
+ +// :yloop
+-+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+-+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+-+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+-+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+-+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+-+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+-+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+-+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+-+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+-+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+-+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+-+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+-+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+-+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+-+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+-+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
+++/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
+++/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+++/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
+++/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0
+++/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
+++/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup
+++/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
+++/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
+++/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
+++/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
+++/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
+++/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
+++/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
+++/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop
+++/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop
+ +// ::mc_filter_b
+-+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+ +// :yloopb
+-+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+-+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+-+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+-+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+-+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+-+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+-+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+-+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+-+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+-+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+-+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+-+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+-+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+-+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+-+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+-+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+-+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+-+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+-+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_interrupt_exit12
+-+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
+++/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
+++/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+++/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00001188] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+++/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+++/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+++/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
+++/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
+++/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0
+++/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
+++/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup
+++/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
+++/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
+++/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
+++/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
+++/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
+++/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
+++/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
+++/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb
+++/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop
+ +// ::mc_end
+ +};
+ +#ifdef __HIGHC__
+@@ -13363,7 +16421,7 @@ index 0000000..06fb166
+ +#endif
+ diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+ new file mode 100644
+-index 0000000..9772796
++index 0000000..d17b9fd
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.h
+ @@ -0,0 +1,19 @@
+@@ -13372,26 +16430,33 @@ index 0000000..9772796
+ +
+ +extern unsigned int rpi_shader[];
+ +
+-+#define mc_setup_uv (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 132)
+-+#define mc_filter_uv_b0 (rpi_shader + 274)
+-+#define mc_filter_uv_b (rpi_shader + 392)
+-+#define mc_exit (rpi_shader + 540)
+-+#define mc_interrupt_exit8 (rpi_shader + 558)
+-+#define mc_setup (rpi_shader + 588)
+-+#define mc_filter (rpi_shader + 872)
+-+#define mc_filter_b (rpi_shader + 992)
+-+#define mc_interrupt_exit12 (rpi_shader + 1114)
+-+#define mc_exit1 (rpi_shader + 1152)
+-+#define mc_end (rpi_shader + 1168)
+++#define mc_setup_c (rpi_shader + 0)
+++#define mc_filter_uv (rpi_shader + 152)
+++#define mc_filter_uv_b0 (rpi_shader + 280)
+++#define mc_interrupt_exit8c (rpi_shader + 554)
+++#define mc_exit (rpi_shader + 582)
+++#define mc_exit_c (rpi_shader + 582)
+++#define mc_interrupt_exit12 (rpi_shader + 598)
+++#define mc_exit1 (rpi_shader + 634)
+++#define mc_setup (rpi_shader + 650)
+++#define mc_filter (rpi_shader + 942)
+++#define mc_filter_b (rpi_shader + 1094)
+++#define mc_end (rpi_shader + 1246)
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+ new file mode 100644
+-index 0000000..aa9e1e7
++index 0000000..aa3fe47
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.qasm
+-@@ -0,0 +1,1098 @@
++@@ -0,0 +1,1259 @@
+++
+++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+++# the warning that we are using rotation & ra/rb registers. r0..3 can be
+++# rotated through all 16 elems; ra regs can only be rotated through their
+++# local 4.  As it happens this is what is wanted here as we do not want the
+++# constants from the other half of the calc.
+++
+ +# register allocation
+ +#
+ +# ra0...ra7                                     eight horizontal filter coefficients
+@@ -13408,32 +16473,32 @@ index 0000000..aa9e1e7
+ +#
+ +# rb8...rb11                                    eight vertical filter coefficients
+ +
+-+# ra4                                           y: Fiter, UV: 0x10000
+++# ra4                                           y: Filter, UV: part of b0 -> b stash
+ +
+ +# rb12                                          offset to add before shift (round + weighting offsets)
+ +# rb13                                          shift: denom + 6 + 9
+ +# rb14                                          L0 weight (U on left, V on right)
+ +# rb15                                          -- free --
+ +#
+-+# ra16                                          clipped(row start address+elem_num)&~3
+-+# ra17                                          per-channel shifts
+++# ra16                                          width:height
+++# ra17                                          ra_y:ra_xshift
+ +# ra18                                          L1 weight (Y)
+-+# ra19                                          next ra17
+++# ra19                                          ra_y_next:ra_xshift_next
+ +#
+ +# rb16                                          pitch
+ +# rb17                                          height + 1
+-+# rb18                                          height + 3
+-+# rb19                                          next ra16
+++# rb18                                          max(height,16) + 3
+++# rb19                                          frame_base2_next
+ +#
+ +# ra20                                          1
+-+# ra21                                          ra_21
+++# ra21                                          ra_y2_next:ra_y2 (luma); free (chroma)
+ +# ra22 ra_k256                                  256
+-+# ra23 ra_y2_next                               ra_y2_next
+++# ra23                                          0
+ +#
+-+# rb20                                          0xffffff00
+-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+++# rb20                                          -- free --
+++# rb21                                          -- free --
+ +# rb22 rb_k255                                  255
+-+# rb23                                          24
+++# rb23                                          dest (Y)
+ +#
+ +# rb24                                          vdw_setup_1(dst_pitch)
+ +# rb25                                          frame width-1
+@@ -13444,146 +16509,233 @@ index 0000000..aa9e1e7
+ +# rb30                                          frame height-1
+ +# rb31                                          used as temp to count loop iterations
+ +#
+-+# ra24                                          clipped(row start address+8+elem_num)&~3
+-+# ra25                                          per-channel shifts 2
+++# ra24                                          src frame base
+++# ra25                                          src frame base 2
+ +# ra26                                          next ra24
+ +# ra27                                          next ra25
+-+# ra28                                          next y
+-+# ra29                                          y for next texture access
+-+# ra30                                          64
+++# ra28                                          -- free --
+++# ra29                                          -- free --
+ +#
+-+# ra31                                          next kernel address
+++# Use an even numbered register as a link register to avoid corrupting flags
+++# ra30                                          next kernel address
+++# ra31                                          chroma-B height+3; free otherwise
+ +
+-+.set rb_frame_width_minus_1,       rb25
+-+.set rb_frame_height_minus_1,      rb30
+++.set rb_max_x,                     rb25
+++.set rb_max_y,                     rb30
+ +.set rb_pitch,                     rb16
+-+.set ra_x,                         ra16
+++.set ra_width_height,              ra16
+++.set ra_width,                     ra16.16b
+++.set ra_height,                    ra16.16a
+ +.set ra_y2,                        ra21.16a
+ +.set ra_y2_next,                   ra21.16b
+ +
+-+.set rb_x_next,                    rb19
+-+.set rx_frame_base2_next,          rb19
+++.set rb_base2_next,                rb19
+ +
+-+.set ra_frame_base,                ra24
+-+.set ra_frame_base_next,           ra26
+-+.set ra_xshift,                    ra17
+++.set rb_dest,                      rb23
+++.set ra_base,                      ra24
+++.set ra_base_next,                 ra26
+++.set ra_xshift,                    ra17.16a
+ +
+-+.set ra_u2v_ref_offset,            ra25
+-+.set ra_frame_base2,               ra25
+++.set ra_base2,                     ra25
+ +
+-+.set ra_xshift_next,               ra19
+-+.set rx_xshift2,                   rb0
+-+.set rx_xshift2_next,              rb1
+++# Note ra_xy & ra_xy_next should have same structure!
+++.set ra_xshift_next,               ra19.16a
+++.set rb_xshift2,                   rb0
+++.set rb_xshift2_next,              rb1
+ +
+-+.set ra_u2v_dst_offset,            ra27
+-+
+-+.set ra_y_next,                    ra28
+-+.set ra_y,                         ra29
+++.set ra_y_next,                    ra19.16b
+++.set ra_y,                         ra17.16b
+ +
+ +.set ra_k1,                        ra20
+++.set rb_xpitch,                    rb20
+ +.set rb_k255,                      rb22
+ +.set ra_k256,                      ra22
+++.set ra_k0,                        ra23
+++
+++.set ra_link,                      ra30
+ +
+ +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+ +.set i_shift16,                    -16
+ +.set i_shift21,                    -11
+++.set i_shift23,                     -9
+++.set i_shift30,                     -2
+ +
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+-+::mc_setup_uv
+++# Much of the setup code is common between Y & C
+++# Macros that express this - obviously these can't be overlapped
+++# so are probably unsuitable for loop code
+++
+++.macro m_calc_dma_regs, r_vpm, r_dma
+++  mov r2, qpu_num                      # Build per-QPU VPM write / DMA out setup words; r2 = this QPU's number
+++  asr r1, r2, 2                        # r1 = qpu_num >> 2
+++  shl r1, r1, 6                        # r1 = (qpu_num >> 2) << 6
+++  and r0, r2, 3                        # r0 = qpu_num & 3
+++  or  r0, r0, r1                       # r0 = per-QPU offset merged into both setup words below
+ +
+-+# Read starting kernel
+-+mov ra31, unif
+++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++  add r_vpm, r0, r1  # VPM 8bit storage; result goes to the r_vpm macro argument
+++
+++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++  shl r0, r0, 5                        # Shift the per-QPU offset left 5 before adding it to the DMA setup word
+++  add r_dma, r0, r1  # DMA out; result goes to the r_dma macro argument
+++.endm
+++
+++# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16
+++.macro m_calc_dma_regs_c, r_vpm, r_dma
+++  mov r2, qpu_num                      # r2 = this QPU's number
+++  asr r1, r2, 1                        # r1 = qpu_num >> 1
+++  shl r1, r1, 5                        # r1 = (qpu_num >> 1) << 5
+++  and r0, r2, 1                        # r0 = qpu_num & 1
+++  or  r0, r0, r1                       # r0 = per-QPU offset merged into both setup words below
+++
+++  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+++  add r_vpm, r0, r1  # VPM storage (h16p access); result goes to the r_vpm macro argument
+++
+++  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
+++  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
+++  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
+++  shl r0, r0, 6                        # 3 + 3 bits, as described in the note above
+++  add r_dma, r0, r1  # DMA out; result goes to the r_dma macro argument
+++.endm
+++
+++
+++################################################################################
+++# mc_setup_c(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+++::mc_setup_c
+++  mov tmurs, 1          ; mov -, unif        # No swap TMUs ; Next fn (ignored)
+ +
+ +# Load first request location
+-+add ra_x, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_frame_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+++  mov ra0, unif         # next_x_y
+++
+++  mov ra_base, unif                             # Store frame c base
+ +
+ +# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++  sub rb_max_x, unif, 1     # pic c width
+++  sub rb_max_y, unif, 1     # pic c height
+++
+++# load constants
+++  mov ra_k1, 1
+++  mov ra_k256, 256
+++  mov rb_k255, 255
+++  mov ra_k0, 0
+++
+++# touch registers to keep simulator happy
+++
+++  # ra/b4..7: B0 -> B stash registers
+++  mov ra4, 0 ; mov rb4, 0
+++  mov ra5, 0 ; mov rb5, 0
+++  mov ra6, 0 ; mov rb6, 0
+++  mov ra7, 0 ; mov rb7, 0
+++
+++  mov r1, vdw_setup_1(0)  # Merged with dst_stride shortly, delay slot for ra_base
+ +
+++# ; ra12..15: vertical scroll registers
+ +# get source pitch
+-+mov rb16, unif
+++  mov rb_xpitch, unif   ; mov ra12, 0           # stride2
+++  mov rb_pitch, unif    ; mov ra13, 0           # stride1
+++  mov r0, elem_num      ; mov ra14, 0
+++# get destination vdw setup
+++  add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++# Compute base address for first and second access
+++# ra_base ends up with t0s base
+++# ra_base2 ends up with t1s base
+ +
+-+# load constants
+++  add r0, r0, ra0.16b                           # Add elem no to x to get X for this slice
+++  max r0, r0, 0         ; mov ra_y, ra0.16a     # ; stash Y
+++  min r0, r0, rb_max_x
+ +
+-+mov ra4, 0x10000
+-+mov ra_k1, 1
+-+mov ra_k256, 256
+-+mov ra30, 64
+++# Get shift
+++  and r1, r0, 1
+++  shl ra_xshift_next, r1, 4
+ +
+-+mov rb20, 0xffffff00
+-+mov rb_k255, 255
+-+mov rb23, 24
+++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+ +
+-+# touch vertical context to keep simulator happy
+++  and r0, r0, -2
+++  add r0, r0, r0        ; v8subs r1, r1, r1
+++  sub r1, r1, rb_pitch
+++  and r1, r0, r1
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra_y
+++  add ra_base, ra_base, r0
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++  max r0, r1, 0
+++  min r0, r0, rb_max_y
+ +
+-+# Compute base address for first and second access
+-+mov r0, ra_x           # Load x
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+add ra_y, r1, 1
+-+add r0, r0, r3
+-+and r0, r0, ~3
+-+max r1, r1, 0 ; mov ra_x, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+ +# submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+add t0s, r0, r1 ; mov ra_frame_base, r2
+-+add t1s, r2, r1
+++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++  add t0s, ra_base, r0
+++
+++# submit texture requests for 2nd line
+++
+++  max r0, r1, 0
+++  min r0, r0, rb_max_y
+++
+++  add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++  add t0s, ra_base, r0
+ +
+-+mov r2, 9
+-+add rb13, r2, unif  # denominator
+-+mov -, unif         # Unused
+++  add rb13, 9, unif     # denominator
+++  mov -, unif           # Unused
+ +
+ +# Compute part of VPM to use for DMA output
+-+mov r2, unif
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1  # VPM 8bit storage
+-+asr r2, r0, 1     # r0 = bc0000d
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r2, r1  # VPM for 16bit intermediates
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1  # DMA out
+++  m_calc_dma_regs_c rb28, rb27
+++
+++# -----------------
+++# And again for L1, but only worrying about frame2 stuff
+++
+++  mov ra_link, unif        # Next fn
+++
+++# Load first request location
+++  mov ra0, unif            # next_x_y
+++
+++  mov ra_base2, unif # Store frame c base
+++
+++# Compute base address for first and second access
+++# ra_base ends up with t0s base
+++# ra_base2 ends up with t1s base
+++
+++  mov ra_y2, ra0.16a       # Store y
+++  mov r0, ra0.16b          # Load x
+++  add r0, r0, elem_num     # Add QPU slice
+++  max r0, r0, 0         ; mov -, unif           # Unused 0 (x clamped at 0)
+++  min r0, r0, rb_max_x  ; mov -, unif           # Unused 1 (x clamped at rb_max_x)
+++
+++# Get shift
+++  and r1, r0, 1         ; mov -, unif           # Unused 2
+++  shl rb_xshift2_next, r1, 4                    # Shift is 16 when x is odd, 0 when x is even
+++
+++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+++
+++  and r0, r0, -2                                # Clear bottom bit of x
+++  add r0, r0, r0        ; v8subs r1, r1, r1     # r0 = x*2 ; r1 = 0
+++  sub r1, r1, rb_pitch                          # r1 = -rb_pitch, used as a mask on the next line
+++  and r1, r0, r1                                # r1 = bits of r0 selected by the mask
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch # r0 = bits of r0 not selected ; r1 = selected bits * rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra_y2         # r0 = offset to add to the frame base ; r1 = y
+++  add ra_base2, ra_base2, r0
+++
+++  max r0, r1, 0                                 # Clamp y at 0
+++  min r0, r0, rb_max_y                          # Clamp y at rb_max_y
+++
+++# submit texture requests for first line
+++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch # r1 = y + 1 ; r0 = clamped y * rb_pitch
+++  add t1s, ra_base2, r0 ; mov -, unif           # Unused 3
+++
+++# submit texture requests for 2nd line
+++
+++  max r0, r1, 0         ; mov -, unif           # Unused 4
+++
+++  bra -, ra_link                                # Branch to the next fn loaded above; the next 3 instructions still execute
+++
+++  min r0, r0, rb_max_y  ; mov -, unif           # Unused 5
+++  add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch # ra_y2 = y + 2 ; r0 = clamped (y + 1) * rb_pitch
+++  add t1s, ra_base2, r0
+++
+++# >>> ra_link
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x
+-+add t1s, r1, ra_frame_base
+ +
+++.macro setf_nz_if_v
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # Flags from per-element literal: elements 0-7 are 0 (Z), elements 8-15 are 1 (NZ)
+++.endm
+ +
+ +
+ +################################################################################
+@@ -13593,51 +16745,51 @@ index 0000000..aa9e1e7
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# ra_x, ra_x16_base point to the current coordinates for this block
+ +::mc_filter_uv
+-+mov ra31, unif
+++  mov ra_link, unif     ; mov vw_setup, rb28    # ; x_y
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0         ; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+# compute offset from frame base u to frame base v
+-+sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+-+shl ra_xshift_next, r0, 3
+-+add r0, r0, r3        ; mov ra1, unif  # ; width_height
+-+and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
+-+mov ra_y_next, r1     ; mov vw_setup, rb28
+-+add ra_frame_base_next, rb_x_next, r2
+++  mov ra2, unif         ; mov r0, elem_num      # ra2 = packed x (.16b) / y (.16a) for the next block
+++
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+++
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
+++
+++  shl ra_xshift_next, r0, 4                     # x << 4; presumably only bit 4 matters when used as a shift (bottom 5 bits considered)
+++
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y_next, ra2.16a # r0 = x*2 ; stash y for the next block
+++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=width*2 (we are working in pel pairs)
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch # r0 = bits not selected by the mask ; r1 = selected bits * rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
+++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 # ra_base_next = base + offset ; r0 = height * 256
+ +
+ +# set up VPM write
+-+# get width,height of block
+ +
+-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+-+add r0,   r0, ra1.16b    # Combine width and height of destination area
+-+shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
+++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
+++  add rb17, r1, 1       ; mov ra1, unif         # ; U offset/weight
+++  add rb18, r1, 3       ; mov.ifnz ra1, unif    # ; V offset/weight
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++# ; unpack filter coefficients
+ +
+-+# unpack filter coefficients
+++  add r0,   r0, r2      ; mov rb8,  ra3.8a      # Combine width and height of destination area
+++  shl r0,   r0, 15      ; mov rb9,  ra3.8b      # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27    ; mov r1, ra1.16b       # ; r1=weight
+ +
+-+mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
+-+mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
+-+nop                   ; mov rb10, ra3.8c
+-+mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+++  shl r1, r1, rb13      ; mov rb10, ra3.8c
+++  mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+ +
+-+shl r1, ra1.16b, rb13
+-+asr rb12, r1, 1
+-+shl rb14, ra1.16a, 1  # b14 = weight*2
+++  asr rb12, r1, 1
+++  shl rb14, ra1.16a, 1  # b14 = weight*2
+ +
+ +# rb14 - weight L0 * 2
+ +# rb13 = weight denom + 6 + 9
+ +# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+ +
+-+# r2 is elem_num
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+@@ -13646,123 +16798,114 @@ index 0000000..aa9e1e7
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next  # r0 = texture result shifted down by the x shift ; when r3 == rb17 take y from the next block
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y       # r1 = r0 >> 8 ; otherwise keep the current block's y
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next  # Clamp y at 0 ; when r3 == rb17 switch to the next block's base
+++  min r2, r2, rb_max_y                                   # Clamp y at rb_max_y
+++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch         # ra_y = y + 1 ; r2 = clamped y * rb_pitch
+++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  setf_nz_if_v
+ +
+ +# apply horizontal filter
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+++# The filter coeffs for the two halves of this are the same (unlike in the
+++# Y case) so it doesn't matter which ra0 we get them from
+++
+++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++  sub r0, r2, r3        ; mov r3, rb31
+++  sub.setf -, r3, 4     ; mov ra12, ra13
+++  brr.anyn -, r:uvloop
+++  mov ra13, ra14        ; mul24 r1, ra14, rb9
+++  mov ra14, ra15
+++  mov ra15, r0          ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop
+ +
+ +# apply vertical filter and write to VPM
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+sub r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1, rb14
+-+shl r1, r1, 8
+-+
+-+add r1, r1, rb12
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, rb13
+-+min r1, r1, rb_k255       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+-+
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++  sub r1, r1, r0        ; mul24 r0, ra14, rb10
+++  add r1, r1, r0        ; mul24 r0, ra15, rb11
+++  sub r1, r1, r0
+++  sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256  # Flags: loop count vs rb18 (height + 3) ; r1 *= 256
+++  asr r1, r1, 14
+++  nop                   ; mul24 r1, r1, rb14     # r1 *= rb14 (weight * 2, set up before the loop)
+++  shl r1, r1, 8
+ +
+++  add r1, r1, rb12                               # Add the offset/rounding term prepared in rb12
+++  asr ra1.8as, r1, rb13                          # Shift down by rb13 and pack into byte a of ra1
+++  nop                   ; mov r1, r1 << 8        # Vector rotate of r1 by 8 elements - NOTE(review): presumably brings the V half into place
+++  brr.anyn -, r:uvloop                           # Loop while the comparison with rb18 above was negative; next 3 instructions still execute
+++  asr ra1.8bs, r1, rb13                          # Shift down by rb13 and pack into byte b of ra1
+++  mov -, vw_wait
+++  mov vpm, ra1
+++
+++# >>>
+++
+++# DMA out for U & stash for V
+++  bra -, ra_link                                 # Branch to the next fn; the three writes below still execute
+++  mov vw_setup, rb26
+++  mov vw_setup, rb29
+++  mov vw_addr, unif     # u_dst_addr
+++# >>>
+ +
+ +################################################################################
+ +
+-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+ +
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# ra_x, ra_x16_base point to the current coordinates for this block
+ +::mc_filter_uv_b0
+-+mov ra31, unif
+++  mov -, unif           ; mov vw_setup, rb28    # next_fn ignored - always uv_b
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num       # x
+-+max r0, r0, 0                ; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+-+shl ra_xshift_next, r0, 3
+-+add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
+-+and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
+-+mov ra_y_next, r1            ; mov vw_setup, rb21
+-+
+-+add ra_frame_base_next, rb_x_next, r2
+-+
+-+# Need to have unsigned coeffs to so we can just unpack in the filter
+-+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+-+# filter code. Unpack into b regs for V
+-+
+-+# set up VPM write, we need to save 16bit precision
+-+
+-+sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+-+add r0,   r0, ra1.16b           # Combine width and height of destination area
+-+shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
+-+add rb26, r0, rb27
+-+
+-+mov rb8, ra3.8a
+-+mov rb9, ra3.8b
+-+mov rb10, ra3.8c
+-+mov rb11, ra3.8d
+-+
+-+# r2 is elem_num
+-+# r3 is loop counter
+++  mov ra2, unif         ; mov r0, elem_num      # ra2 = packed x (.16b) / y (.16a) for the next block
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+++
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
+++
+++  shl ra_xshift_next, r0, 4                     # x << 4; presumably only bit 4 matters when used as a shift (bottom 5 bits considered)
+++
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y_next, ra2.16a # r0 = x*2 ; stash y for the next block
+++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=width*2 (we are working in pel pairs)
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch # r0 = bits not selected by the mask ; r1 = selected bits * rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
+++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 # ra_base_next = base + offset ; r0 = height * 256
+++
+++# set up VPM write
+++
+++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
+++  add rb17, r1, 1                               # rb17 = height + 1
+++  add ra31, r1, 3       ; mov rb8,  ra3.8a      # ra31 = height + 3 (loop end count for uvloop_b0) ; start unpacking V filter coeffs
+++
+++# ; unpack filter coefficients
+++
+++  add r0,   r0, r2      ; mov rb9,  ra3.8b      # Combine width and height of destination area
+++  shl r0,   r0, 15      ; mov rb10, ra3.8c      # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27
+++
+++  mov r3, 0             ; mov rb11, ra3.8d      # Loop count
+++
+++  mov rb14, unif                                # U weight
+++  mov.ifnz rb14, unif                           # V weight
+ +
+-+mov      rb14, unif                 # U weight L0
+-+mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
+ +# rb14 unused in b0 but will hang around till the second pass
+ +
+ +# retrieve texture results and pick out bytes
+@@ -13773,108 +16916,143 @@ index 0000000..aa9e1e7
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next  # r0 = texture result shifted down by the x shift ; when r3 == rb17 take y from the next block
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y       # r1 = r0 >> 8 ; otherwise keep the current block's y
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next  # Clamp y at 0 ; when r3 == rb17 switch to the next block's base
+++  min r2, r2, rb_max_y                                   # Clamp y at rb_max_y
+++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch         # ra_y = y + 1 ; r2 = clamped y * rb_pitch
+++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop_b0
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+-+# >>> .anyn uvloop_b0
+-+
+-+# apply vertical filter and write to VPM
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+sub.setf -, r3, rb18
+-+brr.anyn -, r:uvloop_b0
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+sub r1, r1, r0          ; mov -, vw_wait
+-+asr vpm, r1, 6
+++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0  # Need to wait 1 cycle for rotated r1
+++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++  sub r0, r2, r3        ; mov r3, rb31
+++  sub.setf -, r3, 4     ; mov ra12, ra13
+++  brr.anyn -, r:uvloop_b0
+++  mov ra13, ra14        ; mul24 r1, ra14, rb9   # ra14 is about to be ra13
+++  mov ra14, ra15        ; mul24 r2, ra15, rb10  # ra15 is about to be ra14
+++  mov ra15, r0          ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop_b0
+ +
+-+# in pass0 we don't really need to save any results, but need to discard the uniforms
+-+# DMA out for U
+-+
+-+bra -, ra31
+-+mov -, unif           # Delay 1
+-+mov -, unif           # Delay 2
+-+nop                   # Delay 3
+-+
+-+
+-+################################################################################
+-+
+-+::mc_filter_uv_b
+-+mov ra31, unif
+++# apply vertical filter and write to B-FIFO
+++
+++  sub r1, r1, r0        ; mov ra8.16b, ra7      # start of B FIFO writes
+++  add r1, r1, r2        ; mul24 r0, ra15, rb11  # N.B. ra15 write gap
+++  sub r1, r1, r0        ; mov ra7, rb6
+++
+++# FIFO goes:
+++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b
+++# This arrangement optimizes the inner loop FIFOs at the expense of making the
+++# bulk shift between loops quite a bit nastier
+++# a8 used as temp
+++
+++  sub.setf -, r3, ra31                          # Flags: loop count vs ra31 (height + 3, set during setup)
+++  asr ra8.16a, r1, 6    ; mov rb6, ra5          # This discards the high bits that might be bad
+++  brr.anyn -, r:uvloop_b0                       # Loop while the comparison above was negative; next 3 instructions still execute
+++  mov ra5, rb4          ; mov rb4, ra4          # FIFO rotate continues in the delay slots
+++  mov ra4, rb5          ; mov rb5, ra6
+++  mov ra6, rb7          ; mov rb7, ra8          # New result (staged in ra8) enters the FIFO at rb7
+++# >>>
+++
+++# 1st half done all results now in the a/b4..7 fifo
+++
+++# Need to bulk rotate FIFO for heights other than 16
+++# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with
+++# we are allowed 3/4 cb_size w/h :-(
+++
+++# Destination uniforms discarded
+++# At the end drop through to _b - we will always do b after b0
+++
+++  sub.setf -, 15, r3    # 12 + 3 of preroll
+++  brr.anyn -, r:uv_b0_post_fin                  # h > 12 (n) => 16 (do nothing)
+++  sub r3, 11, r3        ; mov -, unif           # r3 = shifts wanted ; Discard u_dst_addr
+++  mov r0, i_shift16     ; mov ra_link, unif     # r0 = shift amount used by the shl instructions below
+++  mov r1, 0x10000                               # r1 = multiplier used by the mul24 instructions below (mul ALU counterpart of the shl)
+++# >>>
+++  brr.anyz -, r:uv_b0_post12                    # h == 12 deal with specially
+++# If h != 16 && h != 12 then h <= 8 so
+++# shift 8 with discard (.16b = .16a on all regs)
+++  shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1  # Delay slots: conditional so nothing is shifted when the h == 12 branch is taken
+++  shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
+++  shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
+++# >>>
+++  shl ra4, ra4, r0      ; mul24 rb4, rb4, r1    # Only reached when the h == 12 branch was not taken
+++
+++  shl.setf -, r3, i_shift30  # b2 -> C, b1 -> N
+++# Shift 4
+++  mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
+++  mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
+++  # If we shifted by 4 here then the max length remaining is 4
+++  # so that is it
+++
+++  brr -, r:uv_b0_post_fin                       # Unconditional; the three conditional moves below still execute
+++# Shift 2
+++  mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
+++  mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
+++  mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
+++  # 6 / 2 so need 6 outputs
+++# >>>
+++
+++:uv_b0_post12
+++# this one is annoying as we need to swap halves of things that don't
+++# really want to be swapped
+++
+++# b7a, a6a, b5a, a4a
+++# b4a, a5a, b6a, a7a
+++# b7b, a6b, b5b, a4b
+++# b4b, a5b, b6b, a7b
+++
+++  mov r2, ra6           ; mov r3, rb7           # Save ra6 / rb7 before they are overwritten
+++  shl ra6, ra5, r0      ; mul24 rb7, rb4, r1    # ra6 / rb7 = shifted ra5 / rb4
+++  mov ra5, r2           ; mov rb4, r3           # ra5 / rb4 = old ra6 / rb7
+++
+++  mov r2,  ra4          ; mov r3,  rb5          # Save ra4 / rb5 before they are overwritten
+++  shl ra4, ra7, r0      ; mul24 rb5, rb6, r1    # ra4 / rb5 = shifted ra7 / rb6
+++  mov ra7, r2           ; mov rb6, r3           # ra7 / rb6 = old ra4 / rb5
+++
+++:uv_b0_post_fin
+++
+++##### L1 B processing
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+-+# set up VPM write
+-+mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+-+
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0                      ; mov ra_y_next, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
+-+# compute offset from frame base u to frame base v
+-+sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
+-+add r0, r0, r3                     ; mov ra1, unif       # width_height
+-+and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
+++  mov ra2, unif         ; mov r0, elem_num
+ +
+-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+ +
+-+add ra_frame_base_next, rb_x_next, r2
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov -, unif           # ; width_height
+ +
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+-+shr r3, r3, 8
+-+add vr_setup, r3, rb21
+++  shl rb_xshift2_next, r0, 4
+ +
+-+add r0, r0, ra1.16b    # Combine width and height of destination area
+-+shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y2_next, ra2.16a
+++  and r1, r0, r1        ; mov ra3, unif         # ; V filter coeffs
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov rb8,  ra3.8a      # Add stripe offsets ; start unpacking filter coeffs
+++  add rb_base2_next, r3, r0
+ +
+-+# get filter coefficients
+++  mov ra1, unif         ; mov rb9,  ra3.8b      # U offset/weight
+++  mov.ifnz ra1, unif    ; mov rb10, ra3.8c      # V offset/weight
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# Get offset & weight stuff
+-+
+-+# The unif read occurs unconditionally, only the write is conditional
+-+mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
+-+mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
+-+nop                 ; mov rb10, ra3.8c
+-+mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
+-+
+-+shl r1, ra1.16b, rb13
+-+asr rb12, r1, 1
+++  nop                   ; mov rb11, ra3.8d
+++  shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3     # ; r3 (loop counter)  = 0
+++  asr rb12, r1, 1
+ +
+ +# ra1.16a used directly in the loop
+ +
+@@ -13882,125 +17060,147 @@ index 0000000..aa9e1e7
+ +# then submit two more texture requests
+ +
+ +# r3 = 0
+++
+ +:uvloop_b
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y2
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
+++  min r2, r2, rb_max_y
+++  add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch
+++  add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255  # v8subs masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+ +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop_b
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+++  and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
+++  nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
+++  sub r0, r2, r3       ; mov r3, rb31
+++  sub.setf -, r3, 4    ; mov ra12, ra13
+++  brr.anyn -, r:uvloop_b
+++  mov ra13, ra14          ; mul24 r1, ra14, rb9
+++  mov ra14, ra15          ; mul24 r2, ra15, rb10
+++  mov ra15, r0            ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop_b
+ +
+ +# apply vertical filter and write to VPM
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+-+sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+asr r1, r1, 14          # shift2=6
+-+
+-+asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+-+nop                     ; mul24 r0, r0, rb14
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+shl r1, r1, 8           # Lose bad top 8 bits & sign extend
+++  sub r1, r1, r0        ; mov ra8.16b, ra7      # FIFO rotate (all ra/b4..7)
+++  add r1, r1, r2        ; mul24 r0, ra15, rb11
+++  sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
+++  mov ra7, rb6          ; mul24 r1, r1, ra_k256
+++  asr r1, r1, 14        ; mov rb6, ra5 # shift2=6
+ +
+-+add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+++  mov ra5, rb4          ; mul24 r1, r1, ra1.16a
+++  add r1, r1, r0        ; mov rb4, ra4
+ +
+-+brr.anyn -, r:uvloop_b
+-+asr r1, r1, rb13         # Delay 1
+-+min r1, r1, rb_k255       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+++  mov ra4, rb5          ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+++  add r1, r1, rb12      ; mov rb5, ra6          # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+ +
+++  sub.setf -, r3, ra31  ; mov ra6, rb7
+++  asr ra3.8as, r1, rb13
+++  nop                   ; mov r1, r1 << 8
+++  brr.anyn -, r:uvloop_b
+++  asr ra3.8bs, r1, rb13
+++  mov -, vw_wait        ; mov rb7, ra8          #  vw_wait is B-reg (annoyingly) ; Final FIFO mov
+++  mov vpm, ra3
+++# >>>
+ +
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++# DMA out
+ +
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++  bra -, ra_link
+++  mov vw_setup, rb26
+++  mov vw_setup, rb29
+++  mov vw_addr, unif     # c_dst_addr
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+ +
+ +################################################################################
+ +
+ +# mc_exit()
+ +
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+++::mc_interrupt_exit8c
+++  ldtmu0
+++  ldtmu1
+++  ldtmu1
+++  mov  -, vw_wait ; nop ; ldtmu0  # wait on the VDW
+++
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++#  mov -,sacq(0) # 8
+++#  mov -,sacq(0) # 9
+++#  mov -,sacq(0) # 10
+++#  mov -,sacq(0) # 11
+ +
+-+mov -,srel(0)
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+-+ldtmu0
+-+ldtmu1
+-+ldtmu0
+-+ldtmu1
+++# Chroma & Luma the same now
+++::mc_exit_c
+++::mc_exit
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  mov  -, vw_wait ; nop ; ldtmu1 # wait on the VDW
+ +
+-+nop        ; nop ; thrend
+-+nop        ; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++  mov -,srel(0)
+ +
+-+# mc_interrupt_exit8()
+-+::mc_interrupt_exit8
+-+mov  -, vw_wait # wait on the VDW
+++  nop        ; nop ; thrend
+++  nop        ; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+-+ldtmu0
+-+ldtmu1
+-+ldtmu0
+-+ldtmu1
+ +
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+++# mc_interrupt_exit12()
+++::mc_interrupt_exit12
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  mov  -, vw_wait ; nop ; ldtmu1  # wait on the VDW
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++  mov -,sacq(0) # 8
+++  mov -,sacq(0) # 9
+++  mov -,sacq(0) # 10
+++  mov -,sacq(0) # 11
+ +
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+ +
+++::mc_exit1
+++  mov  -, vw_wait # wait on the VDW
+ +
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  ldtmu1
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+ +# LUMA CODE
+ +
+@@ -14010,116 +17210,104 @@ index 0000000..aa9e1e7
+ +################################################################################
+ +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+ +::mc_setup
+-+  mov r3, 16
+-+
+ +  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+-+  mov ra8, unif  # y_x
+-+  mov ra9, unif  # ref_y_base
+-+  mov ra10, unif # y2_x2
+-+  mov ra11, unif # ref_y2_base
+++  mov tmurs, 1          ; mov ra8, unif         # No TMU swap ; y_x
+++  mov ra9, unif         # ref_y_base
+++  mov ra10, unif        # y2_x2
+++  mov ra11, unif        # ref_y2_base
+ +
+ +# Read image dimensions
+-+  mov r1, unif # width_height
+-+  shl r0,r1,r3
+-+  asr r1,r1,r3 # width
+-+  asr r0,r0,r3 # height
+-+  sub rb_frame_width_minus_1,r1,1
+-+  sub rb_frame_height_minus_1,r0,1
+-+
+-+# get source pitch
+-+  mov rb_pitch, unif # src_pitch
+++  mov ra3, unif         # width_height
+++  mov rb_xpitch, unif   # stride2
+++  sub rb_max_x, ra3.16b, 1
+++  sub rb_max_y, ra3.16a, 1
+++  mov rb_pitch, unif    # stride1
+ +
+ +# get destination pitch
+-+  mov r0, unif       # dst_pitch
+ +  mov r1, vdw_setup_1(0)
+-+  add rb24, r1, r0
+++  or  rb24, r1, rb_pitch
+ +
+ +# Compute base address for first and second access
+-+  mov r1, ra8 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+++  mov r3, elem_num
+++  add r0, ra8.16a, r3   # Load x + elem_num
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+++  min r0, r0, rb_max_x
+ +  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  add ra_y, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+++
+++
+++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs
+++
+++  and r0, r0, -4        ; v8subs r2, r2, r2
+++  sub r2, r2, rb_pitch
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base, ra9, r0
+++
+++  mov r1, ra8.16b       # Load y
+++  add ra_y, r1, 1       # Set for next
+ +  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+  mov r1, ra10 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+++  min r1, r1, rb_max_y
+++
+++# submit texture requests for first line
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t0s, ra_base, r1
+++
+++
+++  # r3 still contains elem_num
+++  add r0, ra10.16a, r3  # Load x
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  add ra_y2, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+++  min r0, r0, rb_max_x
+++  shl rb_xshift2_next, r0, 3 # Compute shifts
+++
+++  # r2 still contains mask
+++  and r0, r0, -4
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base2, ra11, r0
+++
+++  mov r1, ra10.16b       # Load y
+++  add ra_y2, r1, 1       # Set for next
+ +  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
+++  min r1, r1, rb_max_y
+ +
+++# submit texture requests for first line
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t1s, ra_base2, r1
+ +
+ +# load constants
+ +
+ +  mov ra_k1, 1
+ +  mov ra_k256, 256
+-+  mov ra30, 64
+-+
+-+  mov rb20, 0xffffff00
+ +  mov rb_k255, 255
+-+  mov rb23, 24
+++  mov ra_k0, 0
+ +
+ +# touch vertical context to keep simulator happy
+ +
+-+  mov ra8, 0
+-+  mov ra9, 0
+-+  mov ra10, 0
+-+  mov ra11, 0
+-+  mov ra12, 0
+-+  mov ra13, 0
+-+  mov ra14, 0
+-+  mov ra15, 0
+++  mov ra8,  0           ; mov rb8,  0
+++  mov ra9,  0           ; mov rb9,  0
+++  mov ra10, 0           ; mov rb10, 0
+++  mov ra11, 0           ; mov rb11, 0
+ +
+ +# Compute part of VPM to use
+-+  mov r2, qpu_num
+-+  mov r1, r2
+-+  asr r1, r1, 2
+-+  shl r1, r1, 6
+-+  mov r0, r2
+-+  and r0, r0, 3
+-+  add r0, r0, r1
+-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+  add rb28, r0, r1  # VPM for saving data
+-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+  shl r0, r0, 5
+-+  add rb27, r0, r1  # Command for dma output
+++  m_calc_dma_regs rb28, rb27
+ +
+ +# Weighted prediction denom
+-+  add rb13, unif, 9  # unif = weight denom + 6
+-+
+-+  mov -, unif # Unused
+++  add rb13, unif, 9     # unif = weight denom + 6
+ +
+ +# submit texture requests for second line
+ +  max r1, ra_y, 0
+-+  min r1, r1, rb_frame_height_minus_1
+++  min r1, r1, rb_max_y
+ +  add ra_y, ra_y, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t0s, r1, ra_frame_base
+++  mov -, unif           ; mul24 r1, r1, rb_pitch  # unused ;
+++  add t0s, r1, ra_base
+ +
+ +  max r1, ra_y2, 0
+-+  min r1, r1, rb_frame_height_minus_1
+++  min r1, r1, rb_max_y
+ +  add ra_y2, ra_y2, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t1s, r1, ra_frame_base2
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t1s, r1, ra_base2
+ +
+ +# FALL THROUGHT TO PER-BLOCK SETUP
+ +
+@@ -14127,47 +17315,63 @@ index 0000000..aa9e1e7
+ +# P and B blocks share the same setup code to save on Icache space
+ +:per_block_setup
+ +  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov ra31, unif
+++  mov ra_link, unif
+++#### We do all the setup even if we are about to exit - reading junk from unif....
+ +
+-+  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
+++  mov ra1, unif         ; mov r3, elem_num  # y_x ; elem_num has implicit unpack??
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +  mov ra_xshift, ra_xshift_next
+-+  mov rx_xshift2, rx_xshift2_next
+++  mov rb_xshift2, rb_xshift2_next
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+ +
+-+  add r0, ra1.16a, r1 # Load x
+++  add r0, ra1.16a, r3   # Load x
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  mov r3, 8                          ; mov ra_y_next, ra1.16b
+-+  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
+-+  add ra_frame_base_next, r2, r0
+-+
+-+  add r0, ra1.16a, r1 # Load x
+++  min r0, r0, rb_max_x
+++
+++  shl ra_xshift_next, r0, 3         # Compute shifts
+++  and r0, r0, -4        ; v8subs r2, r2, r2
+++  sub r2, r2, rb_pitch
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base_next, unif, r0              # Base1
+++  mov ra_y_next, ra1.16b                      # Load y
+++  mov ra1, unif         # x2_y2
+++  nop                   # ra1 delay
+++
+++  add r0, ra1.16a, r3   # Load x2
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3         # Compute shifts
+-+  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
+-+  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
+-+  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
+++  min r0, r0, rb_max_x
+++
+++  shl rb_xshift2_next, r0, 3         # Compute shifts
+++  and r0, r0, -4
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add rb_base2_next, unif, r0              # Base1
+++  mov ra_y2_next, ra1.16b                      # Load y
+++  mov ra_width_height, unif         # width_height
+ +
+ +# set up VPM write
+-+  mov vw_setup, rb28
+++  mov vw_setup, rb28    # [ra1 delay]
+ +
+ +# get width,height of block (unif load above)
+-+  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+-+  add rb17, ra1.16a, 5
+-+  add rb18, ra1.16a, 7
+-+  shl r0,   ra1.16a, 7
+-+  add r0,   r0, ra1.16b # Combine width and height of destination area
+-+  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+++  sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width)
+++  add rb17, ra_height, 5  ; mov r0, ra_height
+++  mov r1, 16
+++  min r0, r0, r1
+++  add rb18, r0, 7
+++  shl r0,   r0, 7
+++  add r0,   r0, ra_width                        # Combine width and height of destination area
+++  shl r0,   r0, i_shift16                       # Shift into bits 16 upwards of the vdw_setup0 register
+ +  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
+ +
+ +# get filter coefficients and discard unused B frame values
+-+  shl.ifz r0, r0, i_shift16      # Pick half to use
+-+  shl ra8, r0, 3
+++  shl.ifz r0, r0, i_shift16          ; mov ra5, unif    #  Pick half to use ; L0 offset/weight
+++  mov r2, 0x01040400                 # [ra5 delay]
+++  shl ra8, r0, 3                     ; mov rb14, ra5.16a
+ +
+ +# Pack the 1st 4 filter coefs for H & V tightly
+ +
+@@ -14175,9 +17379,8 @@ index 0000000..aa9e1e7
+ +  ror ra2.8a, r1, ra8.8d
+ +  ror ra0.8a, r1, ra8.8c
+ +
+-+  mov r1,0x01040400
+-+  ror ra2.8b, r1, ra8.8d
+-+  ror ra0.8b, r1, ra8.8c
+++  ror ra2.8b, r2, ra8.8d
+++  ror ra0.8b, r2, ra8.8c
+ +
+ +  mov r1,0x050b0a00  # -ve
+ +  ror ra2.8c, r1, ra8.8d
+@@ -14203,37 +17406,44 @@ index 0000000..aa9e1e7
+ +  ror ra3.8c, r1, ra8.8d
+ +  ror ra1.8c, r1, ra8.8c
+ +
+++  mov r1,0x01010000  # -ve
+++  ror ra3.8d, r1, ra8.8d
+++  ror ra1.8d, r1, ra8.8c
+++
+ +# Extract weighted prediction information in parallel
+++# We are annoyingly A src limited here
+ +
+-+  mov r1,0x01010000  # -ve
+-+  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 weight L1 (hi16)/weight L0 (lo16)
+-+  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
+-+
+-+# r3 = 16 from (long way) above
+-+  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+-+  asr ra18, r0, r3          ; mov rb5, ra3.8b
+-+  bra -, ra31
+-+  shl r0, r0, r3            ; mov rb6, ra3.8c
+-+  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
+-+  asr rb12, r1, 9
+-+
+-+# >>> branch ra31
+++  mov rb4, ra3.8a            ; mov ra18, unif
+++  mov rb5, ra3.8b
+++  mov rb6, ra3.8c
+++  mov.ifnz ra5, ra18
+++
+++  mov rb_dest, unif     # Destination address
+++
+++  bra -, ra_link
+++
+++  shl r0, ra5.16b, rb13      # Offset calc
+++  asr rb12, r0, 9            # For B l1 & L0 offsets should be identical so it doesn't matter which we use
+++  mov r3, 0                  ; mov rb7, ra3.8d
+++# >>> branch ra_link
+ +#
+ +# r3 = 0
+-+# ra18 = weight L1
+-+# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
+-+# rb13 = weight denom + 6 + 9
+-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++# ra18.16a = weight L1
+++# ra5.16a  = weight L0/L1 depending on side (wanted for 2x mono-pred)
+++# rb12     = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++# rb13     = weight denom + 6 + 9
+++# rb14     = weight L0
+ +
+ +
+ +################################################################################
+-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ +# In a P block, y2_x2 should be y_x+8
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +
+ +::mc_filter
+-+# r0 = weight << 16; We want weight * 2 in rb14
+-+  asr rb14, r0, 15
+++# ra5.16a = weight << 16; We want weight * 2 in rb14
+++
+++  shl rb14, ra5.16a, 1
+ +
+ +# r3 = 0
+ +
+@@ -14249,20 +17459,20 @@ index 0000000..aa9e1e7
+ +# might be B where y != y2 so we must do full processing on both y and y2
+ +
+ +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
+ +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+ +  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
+ +
+ +  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+@@ -14271,21 +17481,21 @@ index 0000000..aa9e1e7
+ +
+ +# apply horizontal filter
+ +  nop                  ; mul24      r3, ra0.8a,      r0
+-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+ +  sub r0, r2, r3       ; mov r3, rb31
+ +
+ +  sub.setf -, r3, 8       ; mov r1,   ra8
+@@ -14324,18 +17534,48 @@ index 0000000..aa9e1e7
+ +  max vpm, r1, 0         # Delay 3
+ +# >>> branch.anyn yloop
+ +
+-+# DMA out
+++# If looping again the we consumed 16 height last loop
+++  # rb29 (stride) remains constant
+++  # rb17 remains const (based on total height)
+++  # recalc rb26, rb18 based on new segment height
+++  # N.B. r3 is loop counter still
+++
+++  mov r1, 16
+++  sub r0, ra_height, r1
+++  mov ra_height, r0
+++  max.setf r0, r0, 0    # Done if Z now
+ +
+-+  brr -, r:per_block_setup
+++# DMA out
+++  brr.anyz -, r:per_block_setup
+ +  mov vw_setup, rb26 # VDW setup 0    Delay 1
+ +  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+++  mov vw_addr, rb_dest # start the VDW   Delay 3
+++# >>> .anyz per_block_setup
+++
+++  min r0, r0, r1
+++  add rb18, rb18, r0
+++  sub r0, r0, r1
+++  shl r0, r0, i_shift23
+++  add rb26, rb26, r0
+++
+++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
+++  add rb_dest, rb_dest, r0
+++
+++  mov vw_setup, rb28    # Reset our VDM write pointer
+++
+++  brr -, r:yloop
+++  nop
+++  nop
+++  nop
+++# >>>
+++
+++
+ +
+ +
+ +
+ +################################################################################
+ +
+-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ +# In a P block, only the first half of coefficients contain used information.
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+@@ -14347,7 +17587,7 @@ index 0000000..aa9e1e7
+ +
+ +::mc_filter_b
+ +  # r0 = weightL0 << 16, we want it in rb14
+-+  asr rb14, r0, i_shift16
+++#  asr rb14, r0, i_shift16
+ +
+ +:yloopb
+ +# retrieve texture results and pick out bytes
+@@ -14357,20 +17597,20 @@ index 0000000..aa9e1e7
+ +# Perhaps we could add on the pitch and clip using larger values?
+ +
+ +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
+ +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+ +  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
+ +
+ +  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+@@ -14379,21 +17619,21 @@ index 0000000..aa9e1e7
+ +
+ +# apply horizontal filter
+ +  nop                  ; mul24      r3, ra0.8a,      r0
+-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+ +  sub r0, r2, r3       ; mov r3, rb31
+ +
+ +  sub.setf -, r3, 8       ; mov r1,   ra8
+@@ -14405,7 +17645,6 @@ index 0000000..aa9e1e7
+ +  # >>> .anyn yloopb
+ +
+ +  # apply vertical filter and write to VPM
+-+
+ +  nop                     ; mul24 r0, rb8,  ra2.8a
+ +  nop                     ; mul24 r1, rb9,  ra2.8b
+ +  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+@@ -14421,558 +17660,174 @@ index 0000000..aa9e1e7
+ +
+ +  asr r1, r1, 14
+ +  nop                     ; mul24 r0, r1, rb14
+-+  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  shl r1, r1, 8
+-+
+-+  brr.anyn -, r:yloopb
+-+  asr r1, r1, rb13         # Delay 1
+-+  min r1, r1, rb_k255       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+################################################################################
+-+
+-+# mc_interrupt_exit12()
+-+::mc_interrupt_exit12
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  # Dummy wait to test instructions
+-+#  mov r3,1000000
+-+#:dummy_loop
+-+#  sub.setf r3, r3, 1
+-+#  nop
+-+#  nop
+-+#  brr.anynn -, r:dummy_loop
+-+#  nop
+-+#  nop
+-+#  nop
+-+
+-+  ldtmu0
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu1
+-+
+-+  mov -,sacq(0) # 1
+-+  mov -,sacq(0) # 2
+-+  mov -,sacq(0) # 3
+-+  mov -,sacq(0) # 4
+-+  mov -,sacq(0) # 5
+-+  mov -,sacq(0) # 6
+-+  mov -,sacq(0) # 7
+-+  mov -,sacq(0) # 8
+-+  mov -,sacq(0) # 9
+-+  mov -,sacq(0) # 10
+-+  mov -,sacq(0) # 11
+-+
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+-+
+-+
+-+::mc_exit1
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu0
+-+  ldtmu1
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
+ +
+++  add r1, r1, r0          ; mov -, vw_wait
+++  shl r1, r1, 8
+ +
+-+::mc_end
+-+# Do not add code here because mc_end must appear after all other code.
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-new file mode 100644
+-index 0000000..db41a4d
+---- /dev/null
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -0,0 +1,459 @@
+-+/*****************************************************************************
+-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+-+*
+-+* This program is the proprietary software of Broadcom Corporation and/or
+-+* its licensors, and may only be used, duplicated, modified or distributed
+-+* pursuant to the terms and conditions of a separate, written license
+-+* agreement executed between you and Broadcom (an "Authorized License").
+-+* Except as set forth in an Authorized License, Broadcom grants no license
+-+* (express or implied), right to use, or waiver of any kind with respect to
+-+* the Software, and Broadcom expressly reserves all rights in and to the
+-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
+-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+-+* THE SOFTWARE.
+-+*
+-+* Except as expressly set forth in the Authorized License,
+-+* 1. This program, including its structure, sequence and organization,
+-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
+-+*    all reasonable efforts to protect the confidentiality thereof, and to
+-+*    use this information only in connection with your use of Broadcom
+-+*    integrated circuit products.
+-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+-+*****************************************************************************/
+-+
+-+#ifndef __USER_VCSM__H__INCLUDED__
+-+#define __USER_VCSM__H__INCLUDED__
+-+
+-+/* VideoCore Shared Memory - user interface library.
+-+**
+-+** This library provides all the necessary abstraction for any application to
+-+** make use of the shared memory service which is distributed accross a kernel
+-+** driver and a videocore service.
+-+**
+-+** It is an application design decision to choose or not to use this service.
+-+**
+-+** The logical flow of operations that a user application needs to follow when
+-+** using this service is:
+-+**
+-+**       1) Initialize the service.
+-+**       2) Allocate shared memory blocks.
+-+**       3) Start using the allocated blocks.
+-+**          - In order to gain ownership on a block, lock the allocated block,
+-+**            locking a block returns a valid address that the user application
+-+**            can access.
+-+**          - When finished with using the block for the current execution cycle
+-+**            or function, and so when giving up the ownership, unlock the block.
+-+**       4) A block can be locked/unlocked as many times required - within or outside
+-+**          of - a specific execution context.
+-+**       5) To completely release an allocated block, free it.
+-+**       6) If the service is no longer required, terminate it.
+-+**
+-+**
+-+** Some generic considerations:
+-+
+-+** Allocating memory blocks.
+-+**
+-+**   Memory blocks can be allocated in different manners depending on the cache
+-+**   behavior desired.  A given block can either be:
+-+
+-+**       - Allocated in a non cached fashion all the way through host and videocore.
+-+**       - Allocated in a cached fashion on host OR videocore.
+-+**       - Allocated in a cached fashion on host AND videocore.
+-+**
+-+**   It is an application decision to determine how to allocate a block.  Evidently
+-+**   if the application will be doing substantial read/write accesses to a given block,
+-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
+-+**   better results.
+-+**
+-+**
+-+** Locking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, locking the
+-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**   It is possible to dynamically change the host cache behavior (ie cached or non
+-+**   cached) of a given allocation without needing to free and re-allocate the block.
+-+**   This feature can be useful for such application which requires access to the block
+-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+-+**   the application can optimize performances for a given duration of use.
+-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+-+**   cache.  If one requires to change the videocore cache behavior, then a new block
+-+**   must be created to replace the old one.
+-+**
+-+**   On successful locking, a valid pointer is returned that the application can use
+-+**   to access to data inside the block.  There is no guarantee that the pointer will
+-+**   stay valid following the unlock action corresponding to this lock.
+-+**
+-+**
+-+** Unocking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, unlocking the
+-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
+-+**   explicitely asked not to flush the cache for performances reasons.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**
+-+** A complete API is defined below.
+-+*/
+++  brr.anyn -, r:yloopb
+++  asr r1, r1, rb13         # Delay 1
+++  min r1, r1, rb_k255       # Delay 2
+++  max vpm, r1, 0         # Delay 3
+ +
+-+#ifdef __cplusplus
+-+extern "C"
+-+{
+-+#endif
+ +
+-+/* Different status that can be dumped.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+-+                                    // Result of the walk is seen in the videocore
+-+                                    // log.
+-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+-+                                    // driver (ie for all processes).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+-+                                    // VCSM_STATUS_HOST_WALK_MAP.
+-+                                    //
+-+   VCSM_STATUS_NONE,                // Must be last - invalid.
+-+
+-+} VCSM_STATUS_T;
+-+
+-+/* Different kind of cache behavior.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+-+
+-+} VCSM_CACHE_TYPE_T;
+-+
+-+/* Initialize the vcsm processing.
+-+**
+-+** Must be called once before attempting to do anything else.
+-+**
+-+** Returns 0 on success, -1 on error.
+-+*/
+-+int vcsm_init( void );
+++# If looping again then we consumed 16 height last loop
+++  # rb29 (stride) remains constant
+++  # rb17 remains const (based on total height)
+++  # recalc rb26, rb18 based on new segment height
+++  # N.B. r3 is loop counter still
+ +
+++  mov r1, 16
+++  sub r0, ra_height, r1
+++  mov ra_height, r0
+++  max.setf r0, r0, 0    # Done if Z now
+ +
+-+/* Terminates the vcsm processing.
+-+**
+-+** Must be called vcsm services are no longer needed, it will
+-+** take care of removing any allocation under the current process
+-+** control if deemed necessary.
+-+*/
+-+void vcsm_exit( void );
+++# DMA out
+++  brr.anyz -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, rb_dest # start the VDW   Delay 3
+++# >>> .anyz per_block_setup
+ +
+++  min r0, r0, r1
+++  add rb18, rb18, r0
+++  sub r0, r0, r1
+++  shl r0, r0, i_shift23
+++  add rb26, rb26, r0
+ +
+-+/* Queries the status of the the vcsm.
+-+**
+-+** Triggers dump of various kind of information, see the
+-+** different variants specified in VCSM_STATUS_T.
+-+**
+-+** Pid is optional.
+-+*/
+-+void vcsm_status( VCSM_STATUS_T status, int pid );
+-+
+-+
+-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+-+** allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc( unsigned int size, char *name );
+-+
+-+
+-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+-+** allocator, the type of caching requested is passed as argument of the
+-+** function call.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+-+
+-+
+-+/* Shares an allocated block of memory via the vcsm memory allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_share( unsigned int handle );
+-+
+-+
+-+/* Resizes a block of memory allocated previously by vcsm_alloc.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** The handle must be unlocked by user prior to attempting any
+-+** resize action.
+-+**
+-+** On error, the original size allocated against the handle
+-+** remains available the same way it would be following a
+-+** successful vcsm_malloc.
+-+*/
+-+int vcsm_resize( unsigned int handle, unsigned int new_size );
+-+
+-+
+-+/* Frees a block of memory that was successfully allocated by
+-+** a prior call the vcms_alloc.
+-+**
+-+** The handle should be considered invalid upon return from this
+-+** call.
+-+**
+-+** Whether any memory is actually freed up or not as the result of
+-+** this call will depends on many factors, if all goes well it will
+-+** be freed.  If something goes wrong, the memory will likely end up
+-+** being freed up as part of the vcsm_exit process.  In the end the
+-+** memory is guaranteed to be freed one way or another.
+-+*/
+-+void vcsm_free( unsigned int handle );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a mapped user address
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a opaque handle
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
+++  add rb_dest, rb_dest, r0
+ +
+++  mov vw_setup, rb28    # Reset our VDM write pointer
+ +
+-+/* Retrieves a user opaque handle from a mapped user address
+-+** pointer.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+*/
+-+unsigned int vcsm_usr_handle( void *usr_ptr );
+-+
+-+
+-+/* Retrieves a mapped user address from an opaque user
+-+** handle.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero address on success.
+-+**
+-+** On success, the address corresponds to the pointer
+-+** which can access the data allocated via the vcsm_malloc
+-+** call.
+-+*/
+-+void *vcsm_usr_address( unsigned int handle );
+-+
+-+
+-+/* Locks the memory associated with this opaque handle.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock( unsigned int handle );
+-+
+-+
+-+/* Locks the memory associated with this opaque handle.  The lock
+-+** also gives a chance to update the *host* cache behavior of the
+-+** allocated buffer if so desired.  The *videocore* cache behavior
+-+** of the allocated buffer cannot be changed by this call and such
+-+** attempt will be ignored.
+-+**
+-+** The system will attempt to honour the cache_update mode request,
+-+** the cache_result mode will provide the final answer on which cache
+-+** mode is really in use.  Failing to change the cache mode will not
+-+** result in a failure to lock the buffer as it is an application
+-+** decision to choose what to do if (cache_result != cache_update)
+-+**
+-+** The value returned in cache_result can only be considered valid if
+-+** the returned pointer is non NULL.  The cache_result pointer may be
+-+** NULL if the application does not care about the actual outcome of
+-+** its action with regards to the cache behavior change.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock_cache( unsigned int handle,
+-+                       VCSM_CACHE_TYPE_T cache_update,
+-+                       VCSM_CACHE_TYPE_T *cache_result );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr( void *usr_ptr );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+++  brr -, r:yloopb
+++  nop
+++  nop
+++  nop
+ +
+++################################################################################
+ +
+-+/* Unlocks the memory associated with this user opaque handle.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl( unsigned int handle );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+-+
+-+/* Clean and/or invalidate the memory associated with this user opaque handle
+-+**
+-+** Returns:        non-zero on error
+-+**
+-+** structure contains a list of flush/invalidate commands. Commands are:
+-+** 0: nop
+-+** 1: invalidate       given virtual range in L1/L2
+-+** 2: clean            given virtual range in L1/L2
+-+** 3: clean+invalidate given virtual range in L1/L2
+-+** 4: flush all L1/L2
+-+*/
+-+struct vcsm_user_clean_invalid_s {
+-+   struct {
+-+      unsigned int cmd;
+-+      unsigned int handle;
+-+      unsigned int addr;
+-+      unsigned int size;
+-+   } s[8];
+-+};
+++::mc_end
+++# Do not add code here because mc_end must appear after all other code.
++diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
++new file mode 100644
++index 0000000..27cbb59
++--- /dev/null
+++++ b/libavcodec/rpi_shader_cmd.h
++@@ -0,0 +1,88 @@
+++#ifndef RPI_SHADER_CMD_H
+++#define RPI_SHADER_CMD_H
+++
+++#pragma pack(push, 4)
+++
+++typedef struct qpu_mc_pred_c_s {
+++    uint32_t next_fn;
+++    int16_t next_src_y;
+++    int16_t next_src_x;
+++    uint32_t next_src_base_c;
+++    union {
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t wo_u;
+++            uint32_t wo_v;
+++            uint32_t dst_addr_c;
+++        } p;
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t weight_u;
+++            uint32_t weight_v;
+++            uint32_t dummy0;
+++        } b0;
+++        struct {
+++            uint32_t dummy0;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t wo_u;
+++            uint32_t wo_v;
+++            uint32_t dst_addr_c;
+++        } b1;
+++        struct {
+++            uint32_t pic_cw;            // C Width (== Y width / 2)
+++            uint32_t pic_ch;            // C Height (== Y Height / 2)
+++            uint32_t stride2;
+++            uint32_t stride1;
+++            uint32_t wdenom;
+++            uint32_t dummy0;
+++        } s0;
+++        struct {
+++            uint32_t dummy0;
+++            uint32_t dummy1;
+++            uint32_t dummy2;
+++            uint32_t dummy3;
+++            uint32_t dummy4;
+++            uint32_t dummy5;
+++        } s1;
+++    };
+++} qpu_mc_pred_c_t;
+++
+++typedef struct qpu_mc_pred_y_s {
+++    int16_t next_src1_x;
+++    int16_t next_src1_y;
+++    uint32_t next_src1_base;
+++    int16_t next_src2_x;
+++    int16_t next_src2_y;
+++    uint32_t next_src2_base;
+++    union {
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t mymx21;
+++            uint32_t wo1;
+++            uint32_t wo2;
+++            uint32_t dst_addr;
+++        } p;
+++        struct {
+++            uint16_t pic_h;
+++            uint16_t pic_w;
+++            uint32_t stride2;
+++            uint32_t stride1;
+++            uint32_t wdenom;
+++            uint32_t dummy0;
+++        } s;
+++    };
+++    uint32_t next_fn;
+++} qpu_mc_pred_y_t;
+ +
+-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+++#pragma pack(pop)
+ +
+-+#ifdef __cplusplus
+-+}
+ +#endif
+ +
+-+#endif /* __USER_VCSM__H__INCLUDED__ */
+ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+ new file mode 100644
+-index 0000000..9580165
++index 0000000..b061fe0
+ --- /dev/null
+ +++ b/libavcodec/rpi_zc.c
+-@@ -0,0 +1,406 @@
++@@ -0,0 +1,581 @@
+ +#include "config.h"
+ +#ifdef RPI
+ +#include "rpi_qpu.h"
+++#include "rpi_mailbox.h"
+ +#include "rpi_zc.h"
+++#include "libavutil/avassert.h"
+++#include <pthread.h>
+ +
+ +#include "libavutil/buffer_internal.h"
+++#include <interface/vctypes/vc_image_types.h>
+++
+++#define TRACE_ALLOC 0
+ +
+ +struct ZcPoolEnt;
+ +
+ +typedef struct ZcPool
+ +{
+ +    int numbytes;
+++    unsigned int n;
+ +    struct ZcPoolEnt * head;
+ +    pthread_mutex_t lock;
+ +} ZcPool;
+@@ -14981,27 +17836,56 @@ index 0000000..9580165
+ +{
+ +    // It is important that we start with gmem as other bits of code will expect to see that
+ +    GPU_MEM_PTR_T gmem;
+++    unsigned int n;
+ +    struct ZcPoolEnt * next;
+ +    struct ZcPool * pool;
+ +} ZcPoolEnt;
+ +
+-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
+++#if 1
+++//#define ALLOC_PAD       0x1000
+++#define ALLOC_PAD       0
+++#define ALLOC_ROUND     0x1000
+++//#define ALLOC_N_OFFSET  0x100
+++#define ALLOC_N_OFFSET  0
+++#define STRIDE_ROUND    0x80
+++#define STRIDE_OR       0x80
+++#else
+++#define ALLOC_PAD       0
+++#define ALLOC_ROUND     0x1000
+++#define ALLOC_N_OFFSET  0
+++#define STRIDE_ROUND    32
+++#define STRIDE_OR       0
+++#endif
+++
+++#define DEBUG_ZAP0_BUFFERS 0
+++
+++
+++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
+ +{
+ +    ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+ +
+++    // Round up to 4k & add 4k
+++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
+++
+ +    if (zp == NULL) {
+ +        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ +        goto fail0;
+ +    }
+ +
+-+    if (gpu_malloc_cached(size, &zp->gmem) != 0)
+++    if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
+ +    {
+-+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
+++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
+ +        goto fail1;
+ +    }
+ +
+++#if TRACE_ALLOC
+++    printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+++#endif
+++
+++    pool->numbytes = zp->gmem.numbytes;
+ +    zp->next = NULL;
+ +    zp->pool = pool;
+++    zp->n = pool->n++;
+ +    return zp;
+ +
+ +fail1:
+@@ -15012,6 +17896,10 @@ index 0000000..9580165
+ +
+ +static void zc_pool_ent_free(ZcPoolEnt * const zp)
+ +{
+++#if TRACE_ALLOC
+++    printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+++#endif
+++
+ +    gpu_free(&zp->gmem);
+ +    av_free(zp);
+ +}
+@@ -15020,6 +17908,8 @@ index 0000000..9580165
+ +{
+ +    ZcPoolEnt * p = pool->head;
+ +    pool->head = NULL;
+++    pool->numbytes = -1;
+++
+ +    while (p != NULL)
+ +    {
+ +        ZcPoolEnt * const zp = p;
+@@ -15028,15 +17918,21 @@ index 0000000..9580165
+ +    }
+ +}
+ +
+-+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
+++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
+ +{
+ +    ZcPoolEnt * zp;
+++    int numbytes;
+++
+ +    pthread_mutex_lock(&pool->lock);
+ +
+-+    if (numbytes != pool->numbytes)
+++    numbytes = pool->numbytes;
+++
+++    // If size isn't close then dump the pool
+++    // Close in this context means within 128k
+++    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
+ +    {
+ +        zc_pool_flush(pool);
+-+        pool->numbytes = numbytes;
+++        numbytes = req_bytes;
+ +    }
+ +
+ +    if (pool->head != NULL)
+@@ -15050,6 +17946,10 @@ index 0000000..9580165
+ +    }
+ +
+ +    pthread_mutex_unlock(&pool->lock);
+++
+++    // Start with our buffer empty of preconceptions
+++//    rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
+++
+ +    return zp;
+ +}
+ +
+@@ -15059,6 +17959,10 @@ index 0000000..9580165
+ +    if (zp != NULL)
+ +    {
+ +        pthread_mutex_lock(&pool->lock);
+++#if TRACE_ALLOC
+++        printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
+++#endif
+++
+ +        if (pool->numbytes == zp->gmem.numbytes)
+ +        {
+ +            zp->next = pool->head;
+@@ -15089,10 +17993,18 @@ index 0000000..9580165
+ +    pthread_mutex_destroy(&pool->lock);
+ +}
+ +
+++typedef struct ZcOldCtxVals
+++{
+++    int thread_safe_callbacks;
+++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+++    void * get_buffer_context;
+++} ZcOldCtxVals;
+ +
+ +typedef struct AVZcEnv
+ +{
+++    unsigned int refcount;
+ +    ZcPool pool;
+++    ZcOldCtxVals old;
+ +} ZcEnv;
+ +
+ +// Callback when buffer unrefed to zero
+@@ -15112,28 +18024,94 @@ index 0000000..9580165
+ +}
+ +
+ +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+-+    const unsigned int video_width, const unsigned int video_height)
+++    const int format, const unsigned int video_width, const unsigned int video_height)
+ +{
+ +    AVRpiZcFrameGeometry geo;
+-+    geo.stride_y = (video_width + 32 + 31) & ~31;
+-+    geo.stride_c = geo.stride_y / 2;
+-+//    geo.height_y = (video_height + 15) & ~15;
+-+    geo.height_y = (video_height + 32 + 31) & ~31;
+-+    geo.height_c = geo.height_y / 2;
+++
+++    switch (format)
+++    {
+++        case AV_PIX_FMT_YUV420P:
+++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+++        //    geo.stride_y = ((video_width + 32 + 31) & ~31);
+++            geo.stride_c = geo.stride_y / 2;
+++        //    geo.height_y = (video_height + 15) & ~15;
+++            geo.height_y = (video_height + 32 + 31) & ~31;
+++            geo.height_c = geo.height_y / 2;
+++            geo.planes_c = 2;
+++            geo.stripes = 1;
+++            break;
+++
+++        case AV_PIX_FMT_SAND128:
+++        {
+++            const unsigned int stripe_w = 128;
+++
+++            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+++            static VC_IMAGE_T img = {0};
+++
+++            // Given the overhead of calling the mailbox keep a stashed
+++            // copy as we will almost certainly just want the same numbers again
+++            // but that means we need a lock
+++            pthread_mutex_lock(&sand_lock);
+++
+++            if (img.width != video_width || img.height != video_height)
+++            {
+++                VC_IMAGE_T new_img = {
+++                    .type = VC_IMAGE_YUV_UV,
+++                    .width = video_width,
+++                    .height = video_height
+++                };
+++
+++                gpu_ref();
+++                mbox_get_image_params(gpu_get_mailbox(), &new_img);
+++                gpu_unref();
+++                img = new_img;
+++            }
+++
+++            geo.stride_y = stripe_w;
+++            geo.stride_c = stripe_w;
+++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+++            geo.height_c = img.pitch / stripe_w - geo.height_y;
+++            geo.planes_c = 1;
+++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+++
+++            pthread_mutex_unlock(&sand_lock);
+++
+++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+++            break;
+++        }
+++
+++        default:
+++            memset(&geo, 0, sizeof(geo));
+++            break;
+++    }
+ +    return geo;
+ +}
+ +
+++
+ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+ +{
+ +    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ +    AVBufferRef * buf;
+++    intptr_t idata = (intptr_t)zp->gmem.arm;
+++#if ALLOC_N_OFFSET != 0
+++    intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
+++#endif
+ +
+ +    if (zp == NULL) {
+ +        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ +        goto fail0;
+ +    }
+ +
+-+    if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+++#if ALLOC_N_OFFSET != 0
+++    idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
+++#endif
+++
+++#if DEBUG_ZAP0_BUFFERS
+++    memset((void*)idata, 0, size);
+++#endif
+++
+++    if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ +    {
+ +        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ +        goto fail2;
+@@ -15147,13 +18125,12 @@ index 0000000..9580165
+ +    return NULL;
+ +}
+ +
+-+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
+++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
+ +{
+-+    ZcEnv *const zc = s->get_buffer_context;
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
+ +    const unsigned int size_y = geo.stride_y * geo.height_y;
+ +    const unsigned int size_c = geo.stride_c * geo.height_c;
+-+    const unsigned int size_pic = size_y + size_c * 2;
+++    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
+ +    AVBufferRef * buf;
+ +    unsigned int i;
+ +
+@@ -15161,7 +18138,7 @@ index 0000000..9580165
+ +
+ +    if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ +    {
+-+        av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+++        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ +        return AVERROR(ENOMEM);
+ +    }
+ +
+@@ -15172,19 +18149,24 @@ index 0000000..9580165
+ +    }
+ +
+ +    frame->buf[0] = buf;
+++
+ +    frame->linesize[0] = geo.stride_y;
+ +    frame->linesize[1] = geo.stride_c;
+ +    frame->linesize[2] = geo.stride_c;
+++    if (geo.stripes > 1)
+++        frame->linesize[3] = geo.height_y + geo.height_c;      // abuse: linesize[3] = stripe stride
+++
+ +    frame->data[0] = buf->data;
+ +    frame->data[1] = frame->data[0] + size_y;
+-+    frame->data[2] = frame->data[1] + size_c;
+++    if (geo.planes_c > 1)
+++        frame->data[2] = frame->data[1] + size_c;
+++
+ +    frame->extended_data = frame->data;
+ +    // Leave extended buf alone
+ +
+ +    return 0;
+ +}
+ +
+-+
+ +#define RPI_GET_BUFFER2 1
+ +
+ +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+@@ -15194,21 +18176,25 @@ index 0000000..9580165
+ +#else
+ +    int rv;
+ +
+-+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
+-+        frame->format != AV_PIX_FMT_YUV420P)
+++    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
+ +    {
+ +//        printf("Do default alloc: format=%#x\n", frame->format);
+ +        rv = avcodec_default_get_buffer2(s, frame, flags);
+ +    }
+++    else if (frame->format == AV_PIX_FMT_YUV420P ||
+++             frame->format == AV_PIX_FMT_SAND128)
+++    {
+++        rv = rpi_get_display_buffer(s->get_buffer_context, frame);
+++    }
+ +    else
+ +    {
+-+        rv = rpi_get_display_buffer(s, frame);
+++        rv = avcodec_default_get_buffer2(s, frame, flags);
+ +    }
+ +
+ +#if 0
+-+    printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+-+        frame->width, frame->height,
+-+        frame->linesize[0], frame->linesize[1], frame->linesize[2],
+++    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+++        frame->format, frame->width, frame->height,
+++        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
+ +        frame->data[0], frame->data[1], frame->data[2],
+ +        frame->buf[0], frame->buf[1], frame->buf[2],
+ +        av_buffer_get_opaque(frame->buf[0]));
+@@ -15229,7 +18215,7 @@ index 0000000..9580165
+ +    dest->width = src->width;
+ +    dest->height = src->height;
+ +
+-+    if (rpi_get_display_buffer(s, dest) != 0)
+++    if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ +    {
+ +        return NULL;
+ +    }
+@@ -15262,14 +18248,16 @@ index 0000000..9580165
+ +{
+ +    assert(s != NULL);
+ +
+-+    if (frame->format != AV_PIX_FMT_YUV420P)
+++    if (frame->format != AV_PIX_FMT_YUV420P &&
+++        frame->format != AV_PIX_FMT_SAND128)
+ +    {
+-+        av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
+++        av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ +        return NULL;
+ +    }
+ +
+ +    if (frame->buf[1] != NULL)
+ +    {
+++        av_assert0(frame->format == AV_PIX_FMT_YUV420P);
+ +        if (maycopy)
+ +        {
+ +            av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+@@ -15305,6 +18293,18 @@ index 0000000..9580165
+ +    return p == NULL ? -1 : p->vc_handle;
+ +}
+ +
+++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
+++{
+++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+++    return p == NULL ? 0 : fr_ref->data - p->arm;
+++}
+++
+++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
+++{
+++    return fr_ref == NULL ? 0 : fr_ref->size;
+++}
+++
+++
+ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+ +{
+ +    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+@@ -15341,27 +18341,50 @@ index 0000000..9580165
+ +    }
+ +}
+ +
+++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
+++{
+++    return s->get_buffer2 == av_rpi_zc_get_buffer2;
+++}
+++
+ +int av_rpi_zc_init(struct AVCodecContext * const s)
+ +{
+-+    ZcEnv * const zc = av_rpi_zc_env_alloc();
+-+    if (zc == NULL)
+++    if (av_rpi_zc_in_use(s))
+ +    {
+-+        return AVERROR(ENOMEM);
+++        ZcEnv * const zc = s->get_buffer_context;
+++        ++zc->refcount;
+ +    }
+++    else
+++    {
+++        ZcEnv *const zc = av_rpi_zc_env_alloc();
+++        if (zc == NULL)
+++        {
+++            return AVERROR(ENOMEM);
+++        }
+++
+++        zc->refcount = 1;
+++        zc->old.get_buffer_context = s->get_buffer_context;
+++        zc->old.get_buffer2 = s->get_buffer2;
+++        zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
+ +
+-+    s->get_buffer_context = zc;
+-+    s->get_buffer2 = av_rpi_zc_get_buffer2;
+++        s->get_buffer_context = zc;
+++        s->get_buffer2 = av_rpi_zc_get_buffer2;
+++        s->thread_safe_callbacks = 1;
+++    }
+ +    return 0;
+ +}
+ +
+ +void av_rpi_zc_uninit(struct AVCodecContext * const s)
+ +{
+-+    if (s->get_buffer2 == av_rpi_zc_get_buffer2)
+++    if (av_rpi_zc_in_use(s))
+ +    {
+ +        ZcEnv * const zc = s->get_buffer_context;
+-+        s->get_buffer2 = avcodec_default_get_buffer2;
+-+        s->get_buffer_context = NULL;
+-+        av_rpi_zc_env_free(zc);
+++        if (--zc->refcount == 0)
+++        {
+++            s->get_buffer2 = zc->old.get_buffer2;
+++            s->get_buffer_context = zc->old.get_buffer_context;
+++            s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
+++            av_rpi_zc_env_free(zc);
+++        }
+ +    }
+ +}
+ +
+@@ -15369,19 +18392,19 @@ index 0000000..9580165
+ +
+ diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+ new file mode 100644
+-index 0000000..f0109f4
++index 0000000..f4aeb78
+ --- /dev/null
+ +++ b/libavcodec/rpi_zc.h
+-@@ -0,0 +1,83 @@
++@@ -0,0 +1,137 @@
+ +#ifndef LIBAVCODEC_RPI_ZC_H
+ +#define LIBAVCODEC_RPI_ZC_H
+ +
+ +// Zero-Copy frame code for RPi
+ +// RPi needs Y/U/V planes to be contiguous for display.  By default
+ +// ffmpeg will allocate separated planes so a memcpy is needed before
+-+// display.  This code prodes a method a making ffmpeg allocate a single
+-+// bit of memory for the frame when can then be refrence counted until
+-+// display ahs finsihed with it.
+++// display.  This code provides a method of making ffmpeg allocate a single
+++// bit of memory for the frame which can then be reference counted until
+++// display has finished with it.
+ +
+ +#include "libavutil/frame.h"
+ +#include "libavcodec/avcodec.h"
+@@ -15398,10 +18421,13 @@ index 0000000..f0109f4
+ +    unsigned int height_y;
+ +    unsigned int stride_c;
+ +    unsigned int height_c;
+++    unsigned int planes_c;
+++    unsigned int stripes;
+ +} AVRpiZcFrameGeometry;
+ +
+ +
+ +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+++    const int format,
+ +    const unsigned int video_width, const unsigned int video_height);
+ +
+ +// Replacement fn for avctx->get_buffer2
+@@ -15410,7 +18436,7 @@ index 0000000..f0109f4
+ +// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames
+ +// must be set to 1 as otherwise the buffer info is killed before being returned
+ +// by avcodec_decode_video2.  Note also that this means that the AVFrame that is
+-+// return must be manually derefed with av_frame_unref.  This should be done
+++// returned must be manually derefed with av_frame_unref.  This should be done
+ +// after av_rpi_zc_ref has been called.
+ +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+ +
+@@ -15427,6 +18453,11 @@ index 0000000..f0109f4
+ +// Get the vc_handle from the frame ref
+ +// Returns -1 if ref doesn't look valid
+ +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+++// Get offset from the start of the memory referenced
+++// by the vc_handle to valid data
+++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+++// Length of buffer data
+++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+ +// Get the number of bytes allocated from the frame ref
+ +// Returns 0 if ref doesn't look valid
+ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+@@ -15443,6 +18474,8 @@ index 0000000..f0109f4
+ +// Allocate the environment used by the ZC code
+ +void av_rpi_zc_env_free(AVZcEnvPtr);
+ +
+++// Test to see if the context is using zc (checks get_buffer2)
+++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+ +
+ +// Init ZC into a context
+ +// There is nothing magic in this fn - it just packages setting
+@@ -15454,10 +18487,54 @@ index 0000000..f0109f4
+ +// get_buffer2 & get_buffer_context
+ +void av_rpi_zc_uninit(struct AVCodecContext * const s);
+ +
+++
+++
+++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame)
+++{
+++    return frame->linesize[3];
+++}
+++
+++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    const unsigned int stride1 = frame->linesize[0];
+++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
+++    const unsigned int x1 = x & (stride1 - 1);
+++    const unsigned int x2 = x ^ x1;
+++
+++    return x1 + stride1 * y + stride2 * x2;
+++}
+++
+++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+++{
+++    const unsigned int stride1 = frame->linesize[0];
+++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
+++    const unsigned int x = x_c * 2;
+++    const unsigned int x1 = x & (stride1 - 1);
+++    const unsigned int x2 = x ^ x1;
+++
+++    return x1 + stride1 * y_c + stride2 * x2;
+++}
+++
+++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y);
+++}
+++
+++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y);
+++}
+++
+++static inline int rpi_sliced_frame(const AVFrame * const frame)
+++{
+++    return frame->format == AV_PIX_FMT_SAND128;
+++}
+++
+++
+ +#endif
+ +
+ diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+-index f7adb52..708526e 100644
++index f7adb52..3b398a3 100644
+ --- a/libavcodec/utils.c
+ +++ b/libavcodec/utils.c
+ @@ -26,6 +26,12 @@
+@@ -15547,6 +18624,30 @@ index f7adb52..708526e 100644
+                  pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                       CONFIG_MEMORY_POISONING ?
+                                                          NULL :
++@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
++ {
++     int ret;
++ 
+++#ifdef RPI
+++    // This is going to end badly if we let it continue
+++    av_assert0(frame->format != AV_PIX_FMT_SAND128);
+++#endif
+++
++     if ((ret = update_frame_pool(avctx, frame)) < 0)
++         return ret;
++ 
++diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
++index 21f8d9e..71ce7b9 100644
++--- a/libavfilter/avfilter.c
+++++ b/libavfilter/avfilter.c
++@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args)
++                    "options, but options were provided: %s.\n", args);
++             return AVERROR(EINVAL);
++         }
+++        printf("=== args='%s'\n", args);
++ 
++ #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
++             if (   !strcmp(filter->filter->name, "format")     ||
+ diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
+ index b31d233..2767306 100644
+ --- a/libavformat/mpegts.c
+@@ -15601,6 +18702,88 @@ index 0c0ce12..82e0bc3 100644
+  /**
+   * @}
+   */
++diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
++index 0dffa4d..5644176 100644
++--- a/libavutil/pixdesc.c
+++++ b/libavutil/pixdesc.c
++@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
++         .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR |
++                  AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA,
++     },
+++    [AV_PIX_FMT_SAND128] = {
+++        .name = "sand128",
+++        .nb_components = 3,
+++        .log2_chroma_w = 1,
+++        .log2_chroma_h = 1,
+++        .comp = {
+++            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
+++            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
+++            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
+++        },
+++        .flags = 0,
+++    }
++ };
++ #if FF_API_PLUS1_MINUS1
++ FF_ENABLE_DEPRECATION_WARNINGS
++diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
++index 0ed01c4..4705e80 100644
++--- a/libavutil/pixfmt.h
+++++ b/libavutil/pixfmt.h
++@@ -303,7 +303,10 @@ enum AVPixelFormat {
++     AV_PIX_FMT_GBRAP10BE,  ///< planar GBR 4:4:4:4 40bpp, big-endian
++     AV_PIX_FMT_GBRAP10LE,  ///< planar GBR 4:4:4:4 40bpp, little-endian
++ 
++-    AV_PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+++// RPI - not under an #ifdef so that it is visible to calling programs
+++    AV_PIX_FMT_SAND128,   ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+++
+++    AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
++ };
++ 
++ #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A
++diff --git a/libswscale/input.c b/libswscale/input.c
++index 14ab5ab..e61b67a 100644
++--- a/libswscale/input.c
+++++ b/libswscale/input.c
++@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
++     }
++ }
++ 
+++
+++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+++                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+++                       int width, uint32_t *unused)
+++{
+++    // NIF
+++}
+++
++ #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
++ 
++ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
++@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
++     case AV_PIX_FMT_P010BE:
++         c->chrToYV12 = p010BEToUV_c;
++         break;
+++    case AV_PIX_FMT_SAND128:
+++        c->chrToYV12 = sand128ToUV_c;
+++        break;
++     }
++     if (c->chrSrcHSubSample) {
++         switch (srcFormat) {
++diff --git a/libswscale/utils.c b/libswscale/utils.c
++index 576d8f0..d7206cc 100644
++--- a/libswscale/utils.c
+++++ b/libswscale/utils.c
++@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
++     [AV_PIX_FMT_AYUV64LE]    = { 1, 1},
++     [AV_PIX_FMT_P010LE]      = { 1, 0 },
++     [AV_PIX_FMT_P010BE]      = { 1, 0 },
+++#ifdef RPI
+++    [AV_PIX_FMT_SAND128]     = { 1, 0 },
+++#endif
++ };
++ 
++ int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+ diff --git a/pi-util/conf.sh b/pi-util/conf.sh
+ new file mode 100755
+ index 0000000..8b596a2
+@@ -15640,21 +18823,61 @@ index 0000000..8b596a2
+ +
+ +# gcc option for getting asm listing
+ +# -Wa,-ahls
++diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh
++new file mode 100644
++index 0000000..160e149
++--- /dev/null
+++++ b/pi-util/conf1.sh
++@@ -0,0 +1,34 @@
+++echo "Configure for Pi1"
+++
+++RPI_BUILDROOT=`pwd`/build
+++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+++#RPI_KEEPS="-save-temps=obj"
+++RPI_KEEPS=""
+++
+++./configure --enable-cross-compile\
+++ --cpu=arm1176jzf-s\
+++ --arch=armv\
+++ --disable-neon\
+++ --target-os=linux\
+++ --disable-stripping\
+++ --enable-mmal\
+++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+++
+++
+++# --enable-extra-warnings\
+++# --arch=armv71\
+++# --enable-shared\
+++
+++# gcc option for getting asm listing
+++# -Wa,-ahls
+ diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+ new file mode 100644
+-index 0000000..61d1399
++index 0000000..fc14f2a
+ --- /dev/null
+ +++ b/pi-util/conf_h265.csv
+ @@ -0,0 +1,144 @@
+ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+-+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+ +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+ +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+ +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+ +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+ +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+ +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+-+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+ +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+ +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+ +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+@@ -15676,7 +18899,7 @@ index 0000000..61d1399
+ +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+ +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+ +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+-+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+ +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+ +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+ +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+@@ -15716,7 +18939,7 @@ index 0000000..61d1399
+ +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+ +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+ +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+-+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+ +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+ +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+ +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+@@ -15730,10 +18953,10 @@ index 0000000..61d1399
+ +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+ +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+ +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+-+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+ +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+ +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+-+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+ +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+ +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+ +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+@@ -15762,7 +18985,7 @@ index 0000000..61d1399
+ +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+ +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+ +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+-+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+ +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+ +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+ +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+@@ -15771,9 +18994,9 @@ index 0000000..61d1399
+ +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+ +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+ +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+ +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+-+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+ +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+ +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+ +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+@@ -15792,10 +19015,10 @@ index 0000000..61d1399
+ +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+ diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+ new file mode 100644
+-index 0000000..38f942f
++index 0000000..c896bc6
+ --- /dev/null
+ +++ b/pi-util/ffconf.py
+-@@ -0,0 +1,146 @@
++@@ -0,0 +1,154 @@
+ +#!/usr/bin/env python
+ +
+ +import os
+@@ -15839,16 +19062,18 @@ index 0000000..38f942f
+ +    except:
+ +        pass
+ +
+-+    rv = False
+ +    if  m1 and m2 and m1.group() == m2.group():
+ +        print >> flog, "Match: " + m1.group()
+-+        rv = True
+++        rv = 0
+ +    elif not m1:
+ +        print >> flog, "****** Cannot find m1"
+++        rv = 3
+ +    elif not m2:
+ +        print >> flog, "****** Cannot find m2"
+++        rv = 2
+ +    else:
+ +        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+++        rv = 1
+ +    flog.close()
+ +    return rv
+ +
+@@ -15894,19 +19119,25 @@ index 0000000..38f942f
+ +            print "==== ", name,
+ +            sys.stdout.flush()
+ +
+-+            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+-+                if exp_test == 1:
+-+                    failures.append(name)
+-+                    print ": * FAIL *"
+-+                else:
+-+                    print ": fail"
+-+            else:
+++            rv = testone(os.path.join(conf_root, name), name, a[2], a[3])
+++            if (rv == 0):
+ +                if exp_test == 2:
+ +                    print ": * OK *"
+ +                    unx_success.append(name)
+ +                else:
+ +                    print ": ok"
+-+
+++            elif exp_test > 1 and rv == 1:
+++                print ": fail"
+++            else:
+++                failures.append(name)
+++                if rv == 1:
+++                    print ": * FAIL *"
+++                elif (rv == 2) :
+++                    print ": * CRASH *"
+++                elif (rv == 3) :
+++                    print ": * MD5 MISSING *"
+++                else :
+++                    print ": * BANG *"
+ +
+ +    if failures or unx_success:
+ +        print "Unexpected Failures:", failures
+@@ -18450,6 +21681,21 @@ index 0000000..1eacc04
+ +
+ +if __name__ == '__main__':
+ +   main()
++diff --git a/pi-util/qem.sh b/pi-util/qem.sh
++new file mode 100644
++index 0000000..47dd071
++--- /dev/null
+++++ b/pi-util/qem.sh
++@@ -0,0 +1,9 @@
+++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+++QASM=python\ pi-util/qasm.py
+++SRC_FILE=libavcodec/rpi_shader.qasm
+++DST_BASE=shader
+++
+++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+++
+ diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+ new file mode 100755
+ index 0000000..6a9a33f
+@@ -18542,3 +21788,137 @@ index 0000000..d8bdd91
+ +pi-util/rebase_liblinks.py $DST
+ +
+ +
++diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
++new file mode 100644
++index 0000000..5935a11
++--- /dev/null
+++++ b/pi-util/v3dusage.py
++@@ -0,0 +1,128 @@
+++#!/usr/bin/env python
+++
+++import sys
+++import argparse
+++import re
+++
+++def do_logparse(logname):
+++
+++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+++    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+++    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+++    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+++
+++    ttotal = {'idle':0.0}
+++    tstart = {}
+++    qctotal = {}
+++    qtstotal = {}
+++    l2hits = {}
+++    l2total = {}
+++    time0 = None
+++    idle_start = None
+++    qpu_op_no = 0
+++    op_count = 0
+++
+++    with open(logname, "rt") as infile:
+++        for line in infile:
+++            match = rmatch.match(line)
+++            if match:
+++#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+++                time = float(match.group(1))
+++                unit = match.group(3)
+++                opstart = not match.group(2)
+++                optype = match.group(7)
+++                hascb = match.group(8) != "0"
+++
+++                if unit == 'qpu1':
+++                    unit = unit + "." + str(qpu_op_no)
+++                    if not opstart:
+++                        if hascb or optype == 'EXECUTE_SYNC':
+++                            qpu_op_no = 0
+++                        else:
+++                            qpu_op_no += 1
+++
+++                # Ignore sync type
+++                if optype == 'EXECUTE_SYNC':
+++                    continue
+++
+++                if not time0:
+++                    time0 = time
+++
+++                if opstart:
+++                    tstart[unit] = time;
+++                elif unit in tstart:
+++                    op_count += 1
+++                    if not unit in ttotal:
+++                        ttotal[unit] = 0.0
+++                    ttotal[unit] += time - tstart[unit]
+++                    del tstart[unit]
+++
+++                if not idle_start and not tstart:
+++                    idle_start = time
+++                elif idle_start and tstart:
+++                    ttotal['idle'] += time - idle_start
+++                    idle_start = None
+++
+++            match = rqcycle.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in qctotal:
+++                    qctotal[unit] = 0
+++                qctotal[unit] += int(match.group(2))
+++
+++            match = rqtscycle.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in qtstotal:
+++                    qtstotal[unit] = 0
+++                qtstotal[unit] += int(match.group(2))
+++
+++            match = rl2hits.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in l2total:
+++                    l2total[unit] = 0
+++                    l2hits[unit] = 0
+++                l2total[unit] += int(match.group(3))
+++                if match.group(2) == "hits":
+++                    l2hits[unit] += int(match.group(3))
+++
+++
+++    if not time0:
+++        print "No v3d profile records found"
+++    else:
+++        tlogged = time - time0
+++
+++        print "Logged time:", tlogged, "  Op count:", op_count
+++        for unit in sorted(ttotal):
+++            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
+++        print
+++        for unit in sorted(qctotal):
+++            if not unit in qtstotal:
+++                qtstotal[unit] = 0;
+++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
+++            if unit in l2total:
+++                print b'        L2Total: %10d, hits:      %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
+++
+++
+++
+++if __name__ == '__main__':
+++    argp = argparse.ArgumentParser(
+++        formatter_class=argparse.RawDescriptionHelpFormatter,
+++        description="QPU/VPU perf summary from VC logging",
+++        epilog = """
+++Will also summarise TMU stalls if logging requests set in qpu noflush param
+++in the profiled code.
+++
+++Example use:
+++  vcgencmd set_logging level=0xc0
+++  <command to profile>
+++  sudo vcdbg log msg >& t.log
+++  v3dusage.py t.log
+++""")
+++
+++    argp.add_argument("logfile")
+++    args = argp.parse_args()
+++
+++    do_logparse(args.logfile)
+++
diff --git a/projects/RPi2/patches/kodi/kodi-001-backport.patch b/projects/RPi2/patches/kodi/kodi-001-backport.patch
index 08f5de4bc2..6a250524a2 100644
--- a/projects/RPi2/patches/kodi/kodi-001-backport.patch
+++ b/projects/RPi2/patches/kodi/kodi-001-backport.patch
@@ -1,7 +1,91 @@
-From 2ba9dbed84a444bc39a9d83d963e518239a2d8ec Mon Sep 17 00:00:00 2001
+From 6cebd3b7186d58ee1dd14263f532f9a8c6f005bd Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Tue, 28 Oct 2014 00:19:40 +0000
+Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats
+
+---
+ addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++
+ system/peripherals.xml                              |  4 +++-
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp   | 16 ++++++++++++++++
+ 3 files changed, 34 insertions(+), 1 deletion(-)
+
+diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
+index e0060d1fae556de529274dbc6be07455701573a3..6443f3dd885bf0aa8e031039e36e273972a310ae 100644
+--- a/addons/resource.language.en_gb/resources/strings.po
++++ b/addons/resource.language.en_gb/resources/strings.po
+@@ -19745,3 +19745,18 @@ msgstr ""
+ msgctxt "#39010"
+ msgid "Select sort method"
+ msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38050"
++msgid "Remote button press delay before repeating (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38051"
++msgid "Remote button press repeat rate (ms)"
++msgstr ""
++
++#: system/peripherals.xml
++msgctxt "#38052"
++msgid "Remote button press release time (ms)"
++msgstr ""
+diff --git a/system/peripherals.xml b/system/peripherals.xml
+index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df0621f9e041 100644
+--- a/system/peripherals.xml
++++ b/system/peripherals.xml
+@@ -31,7 +31,9 @@
+     <setting key="device_type" type="int" value="1" configurable="0" />
+     <setting key="wake_devices_advanced" type="string" value="" configurable="0" />
+     <setting key="standby_devices_advanced" type="string" value="" configurable="0" />
+-    <setting key="double_tap_timeout_ms" type="int" min="0" value="300" configurable="0" />
++    <setting key="double_tap_timeout_ms" type="int" min="50" max="1000" step="50" value="300" label="38050" order="16" />
++    <setting key="button_repeat_rate_ms" type="int" min="0" max="250" step="10" value="0" label="38051" order="17" />
++    <setting key="button_release_delay_ms" type="int" min="0" max="500" step="50" value="0" label="38052" order="18" />
+   </peripheral>
+ 
+   <peripheral vendor_product="2548:1001,2548:1002" bus="usb" name="Pulse-Eight CEC Adapter" mapTo="cec">
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu
+   m_configuration.bActivateSource = config.bActivateSource;
+   bChanged |= SetSetting("activate_source", m_configuration.bActivateSource == 1);
+ 
++#if defined(CEC_DOUBLE_TAP_TIMEOUT_MS_OLD)
++  m_configuration.iDoubleTapTimeout50Ms = config.iDoubleTapTimeout50Ms;
++  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeout50Ms * 50);
++#else
++  m_configuration.iDoubleTapTimeoutMs = config.iDoubleTapTimeoutMs;
++  bChanged |= SetSetting("double_tap_timeout_ms", (int)m_configuration.iDoubleTapTimeoutMs);
++#endif
++
++  m_configuration.iButtonRepeatRateMs = config.iButtonRepeatRateMs;
++  bChanged |= SetSetting("button_repeat_rate_ms", (int)m_configuration.iButtonRepeatRateMs);
++
++  m_configuration.iButtonReleaseDelayMs = config.iButtonReleaseDelayMs;
++  bChanged |= SetSetting("button_release_delay_ms", (int)m_configuration.iButtonReleaseDelayMs);
++
+   m_configuration.bPowerOffOnStandby = config.bPowerOffOnStandby;
+ 
+   m_configuration.iFirmwareVersion = config.iFirmwareVersion;
+@@ -1398,6 +1412,8 @@ void CPeripheralCecAdapter::SetConfigurationFromSettings(void)
+   // backwards compatibility. will be removed once the next major release of libCEC is out
+   m_configuration.iDoubleTapTimeoutMs = GetSettingInt("double_tap_timeout_ms");
+ #endif
++  m_configuration.iButtonRepeatRateMs = GetSettingInt("button_repeat_rate_ms");
++  m_configuration.iButtonReleaseDelayMs = GetSettingInt("button_release_delay_ms");
+ 
+   if (GetSettingBool("pause_playback_on_deactivate"))
+   {
+
+From 0fdeeb63794764ebdd628e52d170bf8bac330efd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Apr 2014 17:27:52 +0100
-Subject: [PATCH 02/71] [cec] Don't suspend pi on tv switch off - it can't wake
+Subject: [PATCH 02/75] [cec] Don't suspend pi on tv switch off - it can't wake
  up
 
 ---
@@ -22,10 +106,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325
      <setting key="use_tv_menu_language" type="bool" value="1" label="36018" order="10" />
      <setting key="pause_playback_on_deactivate" type="bool" value="1" label="36033" configurable="0" />
 
-From 936c12492b75b00bc991b1fbc0bfc740a099206c Mon Sep 17 00:00:00 2001
+From 36f4544b7ac9c810c875e8ae19ab92b3f3dafb59 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Apr 2014 18:19:32 +0100
-Subject: [PATCH 03/71] [rbp/omxplayer] When opening a stream don't try to
+Subject: [PATCH 03/75] [rbp/omxplayer] When opening a stream don't try to
  update gui so often
 
 ---
@@ -49,10 +133,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f
          dialog->ProcessRenderLoop(false);
          if (allowCancel && dialog->IsCanceled())
 
-From d557ef01432ab1b17a41ecf339259c4c2a95a58e Mon Sep 17 00:00:00 2001
+From 2be0471046b5e75078f1a284348b3d2fbd033555 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 8 Mar 2014 15:36:06 +0000
-Subject: [PATCH 04/71] [hifiberry] Hack: force it to be recognised as IEC958
+Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958
  capable to enable passthrough options
 
 ---
@@ -75,10 +159,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5
          info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI")
      {
 
-From 243a6a522a997e5502dfc644415352bab635e26e Mon Sep 17 00:00:00 2001
+From e2b718b239b65f2132406355dfdf9c66da744b9c Mon Sep 17 00:00:00 2001
 From: Ben Avison <bavison@riscosopen.org>
 Date: Thu, 1 May 2014 16:28:39 +0100
-Subject: [PATCH 05/71] Improved file buffering in CArchive
+Subject: [PATCH 05/75] Improved file buffering in CArchive
 
 Even though memcpy is typically inlined by the compiler into byte/word loads
 and stores (at least for release builds), the frequency with which 1, 2 and 4
@@ -138,10 +222,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a
      }
  
 
-From b5f43e1c7e25eb7ddfc72def17fa6ce252febf57 Mon Sep 17 00:00:00 2001
+From e59492cefc6ebc66027e7fb96475f14ad14a650c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 10 Aug 2014 16:45:16 +0100
-Subject: [PATCH 06/71] filesystem: Make support of browsing into archives
+Subject: [PATCH 06/75] filesystem: Make support of browsing into archives
  optional
 
 The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices.
@@ -251,10 +335,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7
    {
      CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url);
 
-From 199df29247e2fa52ad74270db9496bb816e955aa Mon Sep 17 00:00:00 2001
+From 73698542aed16c452fc15f5cd5a438e127676b68 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 13:06:57 +0000
-Subject: [PATCH 07/71] [rbp] Make cachemembuffersize default depend on memory
+Subject: [PATCH 07/75] [rbp] Make cachemembuffersize default depend on memory
  size
 
 ---
@@ -356,10 +440,10 @@ index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176
  }
  
 
-From 60d038a72dc2787e87e78241da4293c40c6e8be0 Mon Sep 17 00:00:00 2001
+From 48eb57a16b9d386dc54b42ab04700f8f7f85fab9 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 30 May 2014 14:58:43 +0100
-Subject: [PATCH 08/71] [settings] Experiment: Report DESKTOP resolution in
+Subject: [PATCH 08/75] [settings] Experiment: Report DESKTOP resolution in
  video settings
 
 ---
@@ -381,10 +465,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9
          StringUtils::Format("%dx%d%s", resolution->width, resolution->height,
                              ModeFlagsToString(resolution->flags, false).c_str()),
 
-From 339e21959b9c8cb48571e1cc17d14c83240043ab Mon Sep 17 00:00:00 2001
+From 952474c036385667d8ec894c178f58490af6f69c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 24 Sep 2014 23:13:52 +0100
-Subject: [PATCH 09/71] [audio] Add settings option to boost centre channel
+Subject: [PATCH 09/75] [audio] Add settings option to boost centre channel
  when downmixing
 
 This allows a dB volume increase to be added to centre channel.
@@ -510,10 +594,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5
      // stereo upmix
      if (upmix && m_src_channels == 2 && m_dst_channels > 2)
 
-From ab24bce0380f80ad67a895d0fb76b65915712a1a Mon Sep 17 00:00:00 2001
+From 1296ca8ae16f160bd8bdf00491582f94577122c5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 27 Oct 2014 15:23:51 +0000
-Subject: [PATCH 10/71] [rbp] Default extract thumbnails to false
+Subject: [PATCH 10/75] [rbp] Default extract thumbnails to false
 
 It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors.
 It genereates a lot of support issues. Best to default to disabled and let users enable it if they must
@@ -539,10 +623,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc
      </category>
    </section>
 
-From 1f2afb5a24ad283a0113d138efaafa05a8c983c3 Mon Sep 17 00:00:00 2001
+From 221907efb819c990488518eb9c4b7cfd91151e4e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 27 Nov 2014 16:31:56 +0000
-Subject: [PATCH 11/71] [languageinvoker] Reduce priority of python threads
+Subject: [PATCH 11/75] [languageinvoker] Reduce priority of python threads
 
 ---
  xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++
@@ -565,10 +649,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1
  }
  
 
-From 275bea7284b6f326aff83305d9614ca8963d745b Mon Sep 17 00:00:00 2001
+From cf222655784da191a022a153fa5614cfbb4d79bd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 29 Nov 2014 15:25:16 +0000
-Subject: [PATCH 12/71] [rbp] hack: wait for splash to complete before changing
+Subject: [PATCH 12/75] [rbp] hack: wait for splash to complete before changing
  hdmi mode
 
 ---
@@ -652,10 +736,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928
  
    RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode();
 
-From 4b9d00125907996bb8db50765d6019aecc06d494 Mon Sep 17 00:00:00 2001
+From 7c77d589e065637bb0644889b520f3902b44b880 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 11 Dec 2014 17:00:57 +0000
-Subject: [PATCH 13/71] Fix for UI not showing both extractflags and
+Subject: [PATCH 13/75] Fix for UI not showing both extractflags and
  extractthumb
 
 ---
@@ -718,10 +802,10 @@ index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b2
            <control type="toggle" />
          </setting>
 
-From 289cdf24dcdd94f577248862ca438f672302b936 Mon Sep 17 00:00:00 2001
+From 9e7d22b484cbccf5d54293a36c3cae38ce7426dd Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Thu, 11 Sep 2014 21:30:43 +0200
-Subject: [PATCH 14/71] Disable autoscrolling while on screensaver and while
+Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while
  opening streams.
 
 ---
@@ -734,10 +818,10 @@ Subject: [PATCH 14/71] Disable autoscrolling while on screensaver and while
  6 files changed, 24 insertions(+), 3 deletions(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index b8ff91b427c4fd430675aab3d1d93098c976031f..fdf7b1dc04e31ffe8e1d1b83825343b24c645b02 100644
+index 947f0937d73cde5e4a4f39ed1a7932bd1e8eb0fe..593acafd15bb0409b4446b6e598f7aa4d7baf434 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
-@@ -5229,3 +5229,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
+@@ -5232,3 +5232,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const
    
    return false;
  }
@@ -852,10 +936,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144
        if (m_lastRenderTime)
          m_autoScrollDelayTime += currentTime - m_lastRenderTime;
 
-From 3e7e8ad0f181636081a5844c4cbc81c4db8e2c64 Mon Sep 17 00:00:00 2001
+From 831794fa04a8589069317953f813ada9f0d3bf54 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 13 Dec 2014 18:35:20 +0000
-Subject: [PATCH 15/71] [demuxer] Avoid memcpy on every demuxer packet
+Subject: [PATCH 15/75] [demuxer] Avoid memcpy on every demuxer packet
 
 Avoids an unnecessary memcpy on every demuxer packet which for
 high bitrate videos can be significant.
@@ -955,10 +1039,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96
      }
      catch(...) {
 
-From 5d25db06c31809475a48876f9de7e0875fdaf1f4 Mon Sep 17 00:00:00 2001
+From 9673bb4533c0a82f4712752b6f6d28f5f1ceb24e Mon Sep 17 00:00:00 2001
 From: anaconda <anaconda@menakite.eu>
 Date: Wed, 25 Feb 2015 18:22:21 +0100
-Subject: [PATCH 16/71] Load OSD dialogs on startup.
+Subject: [PATCH 16/75] Load OSD dialogs on startup.
 
 Fixes skipped frames the first time they're loaded in memory on less powered
 devices, like a Raspberry Pi, when using DVDPlayer.
@@ -1053,10 +1137,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e
  CGUIDialogVideoSettings::~CGUIDialogVideoSettings()
  { }
 
-From a7074f84d7adec4ec29679bb9ecf84d82f3db69c Mon Sep 17 00:00:00 2001
+From 19b2018244c328f5f88f90271e31de66bea486e3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Apr 2015 20:51:14 +0100
-Subject: [PATCH 17/71] [gui] Also limit GUI updates when in non full-screen
+Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen
  video mode
 
 ---
@@ -1064,7 +1148,7 @@ Subject: [PATCH 17/71] [gui] Also limit GUI updates when in non full-screen
  1 file changed, 3 insertions(+), 1 deletion(-)
 
 diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp
-index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61752519ef 100644
+index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea61b2ddcc 100644
 --- a/xbmc/Application.cpp
 +++ b/xbmc/Application.cpp
 @@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI)
@@ -1086,10 +1170,10 @@ index fdf7b1dc04e31ffe8e1d1b83825343b24c645b02..513deb7f27846891fb875b9263ad4d61
      g_windowManager.FrameMove();
    }
 
-From ce41575e8ece9bef487b338b838520407e59ee8d Mon Sep 17 00:00:00 2001
+From b7e74e740581f7e6ab94609171000b747da9c911 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 5 May 2015 23:58:06 +0100
-Subject: [PATCH 18/71] [screensaver] Leave GUI contents available for
+Subject: [PATCH 18/75] [screensaver] Leave GUI contents available for
  screensaver
 
 ---
@@ -1119,10 +1203,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac
  
    // Add window to the history list (we must do this before we activate it,
 
-From 27a497a156eeb9fd7f589f3b121ea5fde3c4d47a Mon Sep 17 00:00:00 2001
+From fe4cef6b6e2a35352ede135ac84ff3539d1ff09e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 6 Jun 2015 18:43:57 +0100
-Subject: [PATCH 19/71] ffmpeg: Automatic switch to software decode for GMC
+Subject: [PATCH 19/75] ffmpeg: Automatic switch to software decode for GMC
  with more than one warp point
 
 ---
@@ -1350,10 +1434,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4
        else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1())
          supported = true;
 
-From e7c0b0639f3216a4ec732453eb3869221035c0c8 Mon Sep 17 00:00:00 2001
+From f5dabe10623f19cd9e8ea015e2d248d47c03900c Mon Sep 17 00:00:00 2001
 From: Claudio-Sjo <Claudio.Porfiri@gmail.com>
 Date: Mon, 16 Feb 2015 14:51:26 +0100
-Subject: [PATCH 20/71] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
+Subject: [PATCH 20/75] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer
  - fixes #15794
 
 ---
@@ -1545,10 +1629,10 @@ index 0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3
    lsn_t m_lsnCurrent; // Position inside the track in logical sector number
    lsn_t m_lsnEnd;   // End of m_iTrack in logical sector number
 
-From ccee8a7820c164e8eef572e2c4940f407992daad Mon Sep 17 00:00:00 2001
+From 9e3b4fd8c161b01d324220252289a5b3a49fb7e8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 24 Jun 2016 19:38:13 +0100
-Subject: [PATCH 21/71] codecoverlay: Include codec name in overlay
+Subject: [PATCH 21/75] codecoverlay: Include codec name in overlay
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++
@@ -1642,10 +1726,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291
    double                    m_iSubtitleDelay;
    bool                      m_bRenderSubs;
 
-From 97a606afbd7b6502cc23b64568df6e93de332bdf Mon Sep 17 00:00:00 2001
+From 119f7291d3b7c1a57d3a86b3836c8a73a7cd1211 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 8 Mar 2016 21:20:58 +0300
-Subject: [PATCH 22/71] [DebugInfo] Add cpu usage info.
+Subject: [PATCH 22/75] [DebugInfo] Add cpu usage info.
 
 ---
  .../VideoPlayer/VideoRenderers/DebugRenderer.cpp   | 56 ++++++++--------------
@@ -1815,10 +1899,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef
  
        m_debugTimer.Set(1000);
 
-From 65148f02888c16e3ae05b0639ddd6a753ae261fa Mon Sep 17 00:00:00 2001
+From 21927619971ef137030d64a0dd102a90a7effaf0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 22 May 2015 13:56:29 +0100
-Subject: [PATCH 23/71] ffmpeg: Allow neon to be enabled in unified builds
+Subject: [PATCH 23/75] ffmpeg: Allow neon to be enabled in unified builds
 
 ---
  tools/depends/target/ffmpeg/Makefile | 4 ++++
@@ -1841,10 +1925,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153
  ifeq ($(OS), linux)
    ffmpg_config += --target-os=$(OS) --cpu=$(CPU)
 
-From 7b3ac50fec10531c959fa94e92e3a2c6be0b8789 Mon Sep 17 00:00:00 2001
+From 7c9767ac163fada0423cf8cc27b05f0d74482220 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 27 Feb 2015 14:37:27 +0000
-Subject: [PATCH 24/71] ffmpeg: Add some upstream HEVC optimisations
+Subject: [PATCH 24/75] ffmpeg: Add some upstream HEVC optimisations
 
 ---
  tools/depends/target/ffmpeg/Makefile               |    6 +-
@@ -5642,10 +5726,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81
 +2.5.0
 +
 
-From 3e411c507b6f607fff9b05dc6a2f041d2a8ef986 Mon Sep 17 00:00:00 2001
+From f15eaf9000104c97d5bfc5ea046b4407cab2a261 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 14:04:18 +0100
-Subject: [PATCH 25/71] [ffmpeg] Add GPU acceleration to hevc
+Subject: [PATCH 25/75] [ffmpeg] Add GPU acceleration to hevc
 
 ---
  tools/depends/target/ffmpeg/Makefile               |     4 +-
@@ -43831,10 +43915,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71
 +2.7.4
 +
 
-From 7d7851a6a6201afea2c705d6ab30494a48006f2d Mon Sep 17 00:00:00 2001
+From 88b331888a7677058bb3dfb064d7eb952b0ce1a9 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 12 Jan 2016 16:29:57 +0000
-Subject: [PATCH 26/71] ffmpeg: Add cabac opimisations for hevc
+Subject: [PATCH 26/75] ffmpeg: Add cabac opimisations for hevc
 
 ---
  .../0001-Squashed-commit-of-the-following.patch    | 2179 ++++++++++++++++++++
@@ -46079,10 +46163,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b
  
  make -j ${BUILDTHREADS} 
 
-From de9212a260d82d7ce6584bf11adde8aa7b9035e9 Mon Sep 17 00:00:00 2001
+From ce532b19d18df015cecb0e2e2ec85f0c89885a25 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 16 Sep 2015 19:05:12 +0100
-Subject: [PATCH 27/71] [3d] Make MVC a valid 3D filename tag
+Subject: [PATCH 27/75] [3d] Make MVC a valid 3D filename tag
 
 ---
  xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++
@@ -46143,10 +46227,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef
      bool m_useDisplayControlHWStereo;
  
 
-From 438d9a918515eba692999e310cecf2816bd68b8d Mon Sep 17 00:00:00 2001
+From df4fc81637ca4b47d4ce0e64110d8bab4bd77cd4 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 5 Oct 2015 14:58:05 +0100
-Subject: [PATCH 28/71] [3d] Swap top/bottom sides of GUI
+Subject: [PATCH 28/75] [3d] Swap top/bottom sides of GUI
 
 ---
  xbmc/guilib/GraphicContext.cpp | 2 +-
@@ -46166,10 +46250,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d
    }
    if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL)
 
-From 5b43a704acfb943c6010d109418779da3a7febda Mon Sep 17 00:00:00 2001
+From 2373df61c862bc62538391596c098a80968d1c0d Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 11 Oct 2015 20:51:37 +0100
-Subject: [PATCH 29/71] Revert "Revert "Disable extra logging by default""
+Subject: [PATCH 29/75] Revert "Revert "Disable extra logging by default""
 
 This reverts commit a880554325be187b877cd8f0e2b338e7267da636.
 ---
@@ -46196,10 +46280,10 @@ index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2
              <options>loggingcomponents</options>
              <delimiter>,</delimiter>
 
-From ba0799b8adc0eba075b0b90c86f6670398b65f45 Mon Sep 17 00:00:00 2001
+From a0543043a26699a0e4a8bed989481ab1320e3f0c Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 21 Dec 2015 22:17:25 +0000
-Subject: [PATCH 30/71] [omximage] Fall back to arm jpeg encode/decode when gpu
+Subject: [PATCH 30/75] [omximage] Fall back to arm jpeg encode/decode when gpu
  is busy
 
 ---
@@ -46442,10 +46526,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218
  
  extern COMXImage g_OMXImage;
 
-From 8d5f8aa788c54c4cf0d1f448d322217b94d7eb29 Mon Sep 17 00:00:00 2001
+From 72ad7c69c3f847ade231f29ac23ffb96ebaf2ae4 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 9 Dec 2015 13:31:14 +0000
-Subject: [PATCH 31/71] [mmalcodec] Fail to open when width is invalid. Can
+Subject: [PATCH 31/75] [mmalcodec] Fail to open when width is invalid. Can
  happen with mpegts files
 
 ---
@@ -46467,10 +46551,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568
    if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software)
      return false;
 
-From 6b2cc20d5a1733f1bd97b46bf938b9b57904ac2c Mon Sep 17 00:00:00 2001
+From 0e735b38e2891c582c5a37dc5ded26cb954948a8 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 19 Sep 2014 11:54:49 +0100
-Subject: [PATCH 32/71] [videoplayer/rbp] Add pi specific option to maintain
+Subject: [PATCH 32/75] [videoplayer/rbp] Add pi specific option to maintain
  vsync with pll adjustment
 
 New A/V sync option in settings/video/playback to do "Adjust PLL".
@@ -46911,10 +46995,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50
  
  extern CRBP g_RBP;
 
-From 57021f87ad5adaa6d559a5a59e4f07469289f578 Mon Sep 17 00:00:00 2001
+From d4a5c46043ced09c53dea24e6ca090a574806e3b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 7 May 2015 15:35:43 +0100
-Subject: [PATCH 33/71] rbp: Support zero copy interface with hevc acceleration
+Subject: [PATCH 33/75] rbp: Support zero copy interface with hevc acceleration
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++
@@ -46958,10 +47042,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803
    if (g_advancedSettings.CanLogComponent(LOGVIDEO))
      CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem);
 
-From b193395bcc84c1954a89811d565bfac787e6315a Mon Sep 17 00:00:00 2001
+From 0fbf365c6de020f0d094c8ab221b159593eecce5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 16 May 2015 18:26:04 +0100
-Subject: [PATCH 34/71] ffmpeg: use upstream mvc patches
+Subject: [PATCH 34/75] ffmpeg: use upstream mvc patches
 
 ---
  ...vcodec-add-h264_mvc-codec-id-and-profiles.patch |  68 ++++++++++++
@@ -47271,10 +47355,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b
 +2.7.4
 +
 
-From 2270fbeb9d9d858e15d77347f50e4813c75d4aff Mon Sep 17 00:00:00 2001
+From f303faf857227cee88db21f5e95bd0a7d2f8c06e Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 29 Jan 2016 17:18:50 +0300
-Subject: [PATCH 35/71] [win32] Settings: Added setting to enable/disable MVC
+Subject: [PATCH 35/75] [win32] Settings: Added setting to enable/disable MVC
  decoder.
 
 ---
@@ -47304,10 +47388,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9
      <category id="display">
        <group id="1">
 
-From 2a51cc049289ca6c012ce2f09313ca13266fc37e Mon Sep 17 00:00:00 2001
+From 9f1937bc8941347695d09078e624cc30beab4a6d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Wed, 20 Jan 2016 17:02:16 +0300
-Subject: [PATCH 36/71] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
+Subject: [PATCH 36/75] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc
  streams.
 
 ---
@@ -47370,10 +47454,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81
        }
      case AVMEDIA_TYPE_DATA:
 
-From 6a2a77a44d394e51330f10da92d2989171ff99b3 Mon Sep 17 00:00:00 2001
+From a451efc2d79422565ef1cbf931444c3ef5165125 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 25 Feb 2016 11:21:25 +0300
-Subject: [PATCH 37/71] [Stereo3D] Added block_lr and block_rl to supported
+Subject: [PATCH 37/75] [Stereo3D] Added block_lr and block_rl to supported
  modes.
 
 ---
@@ -47423,10 +47507,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814
      i++;
    }
 
-From a27d42b9f6b66f08be5561f6224ffb5af56fe38c Mon Sep 17 00:00:00 2001
+From 39522c63603fb5bf00b95a0eba5df6a626ea240f Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Sat, 23 Jan 2016 10:21:32 +0300
-Subject: [PATCH 38/71] [VideoPlayer] Fix possible wrong aspect.
+Subject: [PATCH 38/75] [VideoPlayer] Fix possible wrong aspect.
 
 ---
  xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +-
@@ -47446,10 +47530,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd
    else
      m_fForcedAspectRatio = 0.0;
 
-From 61f6644450fcca90960efbdfcbb619d79b46772f Mon Sep 17 00:00:00 2001
+From b362a9d5e20db180bc6fce923188a921e7a0e985 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 22 Jan 2016 18:18:33 +0300
-Subject: [PATCH 39/71] [VideoPlayer] DemuxFFmpeg: ssif remux
+Subject: [PATCH 39/75] [VideoPlayer] DemuxFFmpeg: ssif remux
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt  |   2 +
@@ -47883,10 +47967,10 @@ index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555
    m_discStubExtensions = ".disc";
    // internal music extensions
 
-From 8e01d2a2a958030fae5173fbcf6a14c8ae1997c6 Mon Sep 17 00:00:00 2001
+From 0bd2f0f4af5d90cd685380e36379590a378d024d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:02:46 +0300
-Subject: [PATCH 40/71] [3DBD] Added support of 3D-BluRay playback.
+Subject: [PATCH 40/75] [3DBD] Added support of 3D-BluRay playback.
 
 ---
  lib/DllLibbluray.h                                 |   8 +
@@ -48876,10 +48960,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 0e7e3baf46d1c699dd14b492d81cd11ec656fe69 Mon Sep 17 00:00:00 2001
+From 913cd365b12a9730cb04bb8a9d5ebddde02d5503 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 11 Mar 2016 16:58:53 +0300
-Subject: [PATCH 41/71] [VideoPlayer] HasVideo returns true if video stream
+Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream
  exists. This don't allow start visualization if audio is opened before video.
 
 ---
@@ -48887,7 +48971,7 @@ Subject: [PATCH 41/71] [VideoPlayer] HasVideo returns true if video stream
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index f206847aa8bd9e57c9e558362ef0728fd7737efd..b533aa5395dac512d3b153b44b86d2fa7276ddb2 100644
+index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45fb2497640 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 @@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause()
@@ -48900,10 +48984,10 @@ index f206847aa8bd9e57c9e558362ef0728fd7737efd..b533aa5395dac512d3b153b44b86d2fa
  
  bool CVideoPlayer::HasAudio() const
 
-From 8615e56935a181d5c85e56ca16854f197f5a39cd Mon Sep 17 00:00:00 2001
+From e8a09603950b958dd1934cb460fda960759485f8 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 42/71] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 42/75] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -48940,10 +49024,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45
      CDVDOverlayImage* overlay = new CDVDOverlayImage();
  
 
-From 6b3a976f6e558e23d9561ea37ac5e5e59eb5b801 Mon Sep 17 00:00:00 2001
+From f10689878e33dc69a2ebbd559f41de12e72784c5 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 7 Apr 2016 17:28:50 +0300
-Subject: [PATCH 43/71] [VideoPlayer] Disable reading extension stream from
+Subject: [PATCH 43/75] [VideoPlayer] Disable reading extension stream from
  input stream if decoder doesn't support it.
 
 ---
@@ -49173,7 +49257,7 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa
  
  class CDVDAudioCodec;
 diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
-index b533aa5395dac512d3b153b44b86d2fa7276ddb2..505747a6d7c45c0a4e67fefa711c85dd5236e35d 100644
+index b244a21ac083c6f7b0e2d455e2b7a45fb2497640..69b031a5623888a1b9a8c0ca7fe34fe3b1900fdc 100644
 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp
 @@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset)
@@ -49200,10 +49284,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1
    // classes
    CDVDOverlayContainer* m_pOverlayContainer;
 
-From fbe74a1f5eabeba77ead6a05a30a2c4e2b2ca283 Mon Sep 17 00:00:00 2001
+From 74d399ad03a76c6f63c4fab2ba8ba2760a2f2180 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Fri, 16 Sep 2016 11:37:48 +0300
-Subject: [PATCH 44/71] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
+Subject: [PATCH 44/75] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from
  platform settings to common settings.
 
 ---
@@ -49299,10 +49383,10 @@ index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540
    m_simpleConditions.insert("have_lcms2");
  #endif
 
-From 79606a9af5952398a535b203d34eab88189b75bf Mon Sep 17 00:00:00 2001
+From 1f0f86550e8cfed2a5de0d436c5c1e1e2ea642a1 Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Fri, 4 Nov 2016 22:56:56 +0300
-Subject: [PATCH 45/71] [VideoPlayer] SSIF: fix for corner case when mvc stream
+Subject: [PATCH 45/75] [VideoPlayer] SSIF: fix for corner case when mvc stream
  is switched before the last packet is read from previous stream.
 
 ---
@@ -49491,33 +49575,33 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9
    typedef std::shared_ptr<CDVDOverlayImage> SOverlay;
    typedef std::list<SOverlay>                 SOverlays;
 
-From 9134b76ed4f3e94794b24624b5251d03c57c2d16 Mon Sep 17 00:00:00 2001
+From ddc42633af64cfc6e9447d40f988c86a9a04250d Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <afedchin@ruswizards.com>
 Date: Tue, 23 Feb 2016 16:01:08 +0300
-Subject: [PATCH 46/71] [libbluray] bump libbluray to 0.9.2-mvc.
+Subject: [PATCH 46/75] [libbluray] bump libbluray to 0.9.2-mvc.
 
 ---
  project/BuildDependencies/scripts/0_package.list | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list
-index 33f87aec9968a24f5c9ba35ab5ea2eb1786feaf9..4fdebd9401b40ca18a474aece3e27f4b696c9d5b 100644
+index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c619c436f 100644
 --- a/project/BuildDependencies/scripts/0_package.list
 +++ b/project/BuildDependencies/scripts/0_package.list
 @@ -17,7 +17,7 @@ freetype-db5a22-win32-vc140.7z
  giflib-5.1.4-win32-vc140.7z
  jsonschemabuilder-1.0.0-win32-3.7z
- libass-ddb383-win32-vc140.7z
+ libass-d18a5f1-win32-vc140.7z
 -libbluray-0.9.3-win32-vc140.7z
 +libbluray-0.9.2-mvc-win32-vc120.7z
  libcdio-0.9.3-win32-vc140.7z
  libcec-4.0.1-win32-vc140-2.7z
  libfribidi-0.19.2-win32.7z
 
-From cab608dce138c7ac52f9acb37945a6d2bbe9a523 Mon Sep 17 00:00:00 2001
+From 30060bc20c7f25701009d77d6b566e26ef77fa14 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 29 Feb 2016 17:00:50 +0000
-Subject: [PATCH 47/71] libbluray: Bump to Nevcairie's v0.9.2
+Subject: [PATCH 47/75] libbluray: Bump to Nevcairie's v0.9.2
 
 This includes 3D support
 ---
@@ -51174,10 +51258,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220
 + 
 +     return fp;
 
-From 336c8898720e5c9f50115b1a359188b44f1fec11 Mon Sep 17 00:00:00 2001
+From d3ad5d1c9d8da1ee7c63cd9302bef058b1da1135 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 6 Mar 2016 12:54:59 +0000
-Subject: [PATCH 48/71] mvc: Automatically enable stereo mode
+Subject: [PATCH 48/75] mvc: Automatically enable stereo mode
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++-
@@ -51235,10 +51319,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd
      break;
      case AV_CODEC_ID_MPEG4:
 
-From e8fc139cb043e1718a8cf8e348fefcc4d00f9acf Mon Sep 17 00:00:00 2001
+From f1b065ebbb0f130da3e28a6a4375f9458cee3fd3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 24 Mar 2016 13:02:58 +0000
-Subject: [PATCH 49/71] ffmpeg: mvc: fix for pixelation from packets with no
+Subject: [PATCH 49/75] ffmpeg: mvc: fix for pixelation from packets with no
  pts/dts
 
 ---
@@ -51300,10 +51384,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba
  	cd $(PLATFORM);\
  	CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \
 
-From 5f2316bdcc751de483d4a52eee31c1c1786469a9 Mon Sep 17 00:00:00 2001
+From 332a8c9c8739a159f62542856c686ee14e996bdd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 11 Nov 2016 15:53:53 +0000
-Subject: [PATCH 50/71] stereoscopicmanager: fixups for rbp
+Subject: [PATCH 50/75] stereoscopicmanager: fixups for rbp
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++
@@ -51541,10 +51625,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782
  };
  
 
-From 32b9a9ab2a9ab92008ae9cc6250b6b898de804f8 Mon Sep 17 00:00:00 2001
+From 2d81f94dcaf52e951bb7e203ea248b48c24d15aa Mon Sep 17 00:00:00 2001
 From: Anton Fedchin <anightik@gmail.com>
 Date: Thu, 10 Mar 2016 18:11:33 +0300
-Subject: [PATCH 51/71] fixup! Revert supporting crappy tab/sbs subtitles. this
+Subject: [PATCH 51/75] fixup! Revert supporting crappy tab/sbs subtitles. this
  fixes regular subtitles.
 
 ---
@@ -51564,10 +51648,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18
    }
  
 
-From 352b7f1fac766e04179adaf308ad544b31b604cb Mon Sep 17 00:00:00 2001
+From 48664856527a85a6d242649a5dcebf85d9420171 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 26 Nov 2016 18:24:18 +0000
-Subject: [PATCH 52/71] DemuxMVC: fixup after SeekTime API change
+Subject: [PATCH 52/75] DemuxMVC: fixup after SeekTime API change
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +-
@@ -51601,10 +51685,36 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425
    virtual int GetStreamLength() { return 0; };
    virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; };
 
-From 6a1debd2bc377a5d68fbed8c0c134898cc28e4e1 Mon Sep 17 00:00:00 2001
+From 945b547c444e7ec5039c88e31b612c57b25edd1b Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 3 Nov 2014 23:17:46 +0000
+Subject: [PATCH 53/75] [cec] Don't discard buttons when repeat mode is enabled
+
+---
+ xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644
+--- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
++++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp
+@@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key)
+   CLog::Log(LOGDEBUG, "%s - received key %2x duration %d", __FUNCTION__, key.iButton, key.iDuration);
+ 
+   CSingleLock lock(m_critSection);
+-  if (key.iDuration > 0)
++  // avoid the queue getting too long
++  if (m_configuration.iButtonRepeatRateMs && m_buttonQueue.size() > 5)
++    return;
++  if (m_configuration.iButtonRepeatRateMs == 0 && key.iDuration > 0)
+   {
+     if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0)
+     {
+
+From 70d24188f34e2846d42f18146baf43952c31aae3 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 4 Nov 2014 18:50:00 +0000
-Subject: [PATCH 54/71] [cec] Temp - more logging
+Subject: [PATCH 54/75] [cec] Temp - more logging
 
 ---
  xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++-
@@ -51656,10 +51766,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a
  }
  
 
-From 307a15f3b87951d023c06b73f0116dd8af1c9382 Mon Sep 17 00:00:00 2001
+From 0d75b80f8862d67a4edc9f769acc0d18448ad268 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 25 May 2016 18:31:17 +0100
-Subject: [PATCH 55/71] rbp: Hard code the number of buffers to improve audio
+Subject: [PATCH 55/75] rbp: Hard code the number of buffers to improve audio
  sync
 
 ---
@@ -51701,10 +51811,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d
  
  }
 
-From e384deac300900920e7cc9fd489e487dc63668ef Mon Sep 17 00:00:00 2001
+From b7bcc39b920c47e7c4273895feae92d4a82ba08f Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 4 Jul 2016 18:30:03 +0100
-Subject: [PATCH 56/71] rbp: Update the GL libs to new naming scheme
+Subject: [PATCH 56/75] rbp: Update the GL libs to new naming scheme
 
 As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues.
 As such we are renaming the firmware GL driver to avoid this.
@@ -51718,7 +51828,7 @@ will be dropped at some point
  3 files changed, 5 insertions(+), 5 deletions(-)
 
 diff --git a/configure.ac b/configure.ac
-index 9bd8d6ec09ff5cf0c6e6caf39850f650f1dd2665..291ff72c1845037f97e215232ab1c2667687f289 100644
+index cbaefbe0a6a42f7d863800d87281a3f680cfea5b..2329e126f807b3eccb8cfd4e6ef3117ec20c85b5 100644
 --- a/configure.ac
 +++ b/configure.ac
 @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then
@@ -51769,10 +51879,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71
  fi
  
 
-From 03bbec2df7c5901415ba5496245ed25ba5841181 Mon Sep 17 00:00:00 2001
+From e63ee8ac3fd87a12bdcf197827a182043e58b4af Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 28 Jun 2016 14:46:01 +0100
-Subject: [PATCH 57/71] ffmpeg: hacky fix for files with GMC
+Subject: [PATCH 57/75] ffmpeg: hacky fix for files with GMC
 
 ---
  xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++--
@@ -51794,10 +51904,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675
          {
            if (pStream->codec->codec_id == AV_CODEC_ID_PROBE)
 
-From 3d5d1b8ef8d74c6c4a41cf7e654c7435f4fe52eb Mon Sep 17 00:00:00 2001
+From 73498b227b428c32c7e5ebc5623d094020fe98a7 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 19 Jul 2016 20:39:18 +0100
-Subject: [PATCH 58/71] mmalrender: Add sharpness control
+Subject: [PATCH 58/75] mmalrender: Add sharpness control
 
 ---
  addons/resource.language.en_gb/resources/strings.po         |  2 +-
@@ -51869,10 +51979,10 @@ index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd
    CCriticalSection m_sharedSection;
    MMAL_COMPONENT_T *m_vout;
 
-From 5375fb0e6dc4ba69f348d0ded98ff36c2084e47e Mon Sep 17 00:00:00 2001
+From 57c94de16036e00a6822e374cc8ebbc8a042dc6b Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 14 Oct 2016 15:37:53 +0100
-Subject: [PATCH 59/71] MMALFFMpeg: Report as SW decode in codec overlay info
+Subject: [PATCH 59/75] MMALFFMpeg: Report as SW decode in codec overlay info
 
 ---
  xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +-
@@ -51892,10 +52002,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0
    CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str());
  }
 
-From 71184cfffaf0216c67b9dd3c600d5c8d805e984a Mon Sep 17 00:00:00 2001
+From 43c6b165b6d0f754f938d54bba00655d436679fd Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 7 Nov 2016 18:28:01 +0000
-Subject: [PATCH 60/71] advancedsettings: Add option to set cache size on
+Subject: [PATCH 60/75] advancedsettings: Add option to set cache size on
  libass
 
 E.g to set total cache size in libass to 32M
@@ -51997,10 +52107,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97
      unsigned int m_jsonTcpPort;
  
 
-From 147c688154b43f57ec048166d901920594d70c28 Mon Sep 17 00:00:00 2001
+From 84623dff0ea921cf494fb9f15379b1bbc43844a0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 13 Nov 2016 20:30:15 +0000
-Subject: [PATCH 61/71] [rbp] Experimental limit libass cache size depending on
+Subject: [PATCH 61/75] [rbp] Experimental limit libass cache size depending on
  arm memory size
 
 ---
@@ -52046,10 +52156,10 @@ index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73
    m_libAssCache = 0;
  
 
-From f4d6563f06f5c674a8b845daca0c58913a0f1712 Mon Sep 17 00:00:00 2001
+From b5d95824c6e029b58aaf3b1d6dd2774661925096 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Mon, 22 Jun 2015 21:46:57 +0100
-Subject: [PATCH 62/71] [rbp] Use default resampling setting on Pi2
+Subject: [PATCH 62/75] [rbp] Use default resampling setting on Pi2
 
 ---
  system/settings/rbp2.xml | 5 +++++
@@ -52072,10 +52182,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
    </section>
  </settings>
 
-From cb0855a5d1f3d8d64c3d780eee4d36a8cae7e460 Mon Sep 17 00:00:00 2001
+From c6165dc89c629abd2583eb7181e0543d6b69c255 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Thu, 1 Dec 2016 17:06:01 +0000
-Subject: [PATCH 63/71] MMALRender: Allow advanced deinterlace with software
+Subject: [PATCH 63/75] MMALRender: Allow advanced deinterlace with software
  decode
 
 Uses YUV420 directly which improves performance.
@@ -52098,10 +52208,10 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e
      status = mmal_port_format_commit(m_deint_output);
      if (status != MMAL_SUCCESS)
 
-From 92cf602789202ca31230bba940bb2e2e551ecbeb Mon Sep 17 00:00:00 2001
+From 15e9791cb79c6c3b5f8c09bba979761451bea04c Mon Sep 17 00:00:00 2001
 From: Nuno Senica <nsenica@gmail.com>
 Date: Tue, 27 Dec 2016 20:59:56 +0000
-Subject: [PATCH 64/71] Apply ffmpeg patches automatically after downloading
+Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading
  and extracting the ffmpeg tar ball
 
 ---
@@ -52132,10 +52242,10 @@ index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccb
    file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper
  "#!/bin/bash
 
-From c1f3b380540377b5b710c46066aa79fa64bc696e Mon Sep 17 00:00:00 2001
+From 358df1970de1f6f107e1681785ed723db0756f0e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sun, 1 May 2016 19:56:43 +0100
-Subject: [PATCH 65/71] omxplayer: Avoid CAEFactory::Suspend which should only
+Subject: [PATCH 65/75] omxplayer: Avoid CAEFactory::Suspend which should only
  be called by application
 
 ---
@@ -52235,10 +52345,10 @@ index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9
    COMXCoreComponent m_omx_render_analog;
    COMXCoreComponent m_omx_render_hdmi;
 
-From 1f20fe91f925c482201feb608b254c4afd235459 Mon Sep 17 00:00:00 2001
+From dd69c1880f97b81981df1ad50f09bfb457ad8532 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 1 Mar 2017 21:40:22 +0000
-Subject: [PATCH 66/71] MMALRender: default to bob (x2) deinterlace for HD
+Subject: [PATCH 66/75] MMALRender: default to bob (x2) deinterlace for HD
 
 There are still issues with some dvb dongles run on the same Pi as playback.
 Default to bob. Users who aren't using these devices will have to manually enable advanced.
@@ -52280,10 +52390,10 @@ index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae
    if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE)
    {
 
-From 8305dec169baacde92f6d2ca4f791d3f281fa67d Mon Sep 17 00:00:00 2001
+From b96bf65f71bca91e4e029ed64c7e3dc86c0d0dad Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 17 Feb 2017 17:58:13 +0000
-Subject: [PATCH 67/71] ffmpeg: Update hevc optimisation to use the gpu service
+Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service
 
 ---
  project/cmake/modules/FindFFMPEG.cmake             |    14 +-
@@ -105595,10 +105705,10 @@ index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e
 ++
 ++
 
-From e6a8a101454e409fd2b6c61324d26252541b6d29 Mon Sep 17 00:00:00 2001
+From 1ec8569a01645467680e3090afba9927cea120d0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Sat, 4 Mar 2017 19:25:40 +0000
-Subject: [PATCH 68/71] ffmpeg: Call get_format to fix an issue with MMAL
+Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL
  rendering
 
 ---
@@ -105720,10 +105830,10 @@ index 3d970429012c1f3aede4df0545ced5006c165d50..e070d96fc340f5bff94d72ae9004c4a9
  CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \
  ./configure --prefix=$FFMPEG_PREFIX \
 
-From 63c5c8ffa14cc3869411b7ba30198362c7a070d1 Mon Sep 17 00:00:00 2001
+From b230c015d539db71bb2eb04232b25805703014c6 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Mar 2017 22:52:37 +0000
-Subject: [PATCH 69/71] MMAL: Remove periodic prime calls and handle from
+Subject: [PATCH 69/75] MMAL: Remove periodic prime calls and handle from
  buffer destructor
 
 If a number of buffers are released at once we can end up stalled in GetPicture with the buffers
@@ -105817,10 +105927,10 @@ index 9279966fa634f6f5a3e00f12dd528337392cf038..c6ba9b024b3c3bbe53d3f0870dd8c839
    CLog::Log(LOGDEBUG, "%s::%s - stopping", CLASSNAME, __func__);
  }
 
-From 4ce3f65bb006dfa4b8d49b58b257728848a9e8dd Mon Sep 17 00:00:00 2001
+From 6f29617ca776bb2e6459a55710a4a569311c8d7e Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 14 Mar 2017 23:22:43 +0000
-Subject: [PATCH 70/71] MMALCodec: Include a timeout of GetPicture in default
+Subject: [PATCH 70/75] MMALCodec: Include a timeout of GetPicture in default
  debug logging
 
 ---
@@ -105841,10 +105951,10 @@ index 22d594cdc217f32f820e3618b4d90a1d75fc769b..e8bc3b930e84e058460b6cfd7caca0d7
  
    return ret;
 
-From 472ede7c96085b80de6779bebd0bbd3482c3b02d Mon Sep 17 00:00:00 2001
+From a3185132fc1828162ad59e09155464b26a7f35b0 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Tue, 21 Mar 2017 20:15:55 +0000
-Subject: [PATCH 71/71] ffmpeg: Add calls to init and deinit gpu service
+Subject: [PATCH 71/75] ffmpeg: Add calls to init and deinit gpu service
 
 ---
  tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 6 ++++--
@@ -105883,3 +105993,15067 @@ index 852815d5f4ae80771c5304f6f3520b5e49b18a67..b4c15b782a4deb36c35a006e8547ce69
  +  vcsm_exit();
  +
  +  mbox_close(mb);
+
+From 9ef1f2fdde0e49ae3c5da03defa83d32ab2e432d Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Mon, 27 Mar 2017 20:06:42 +0100
+Subject: [PATCH 72/75] squash: ffmpeg: hevc: Remove rules that require qasm
+
+---
+ tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 12 ------------
+ 1 file changed, 12 deletions(-)
+
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index b4c15b782a4deb36c35a006e8547ce69665a10fe..58379fb0874521205184c53be5aae893cfd39d49 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -308,18 +308,6 @@ index fd0d1f0..40d22d2 100644
+         vorbis_parser.o                                                  \
+         xiph.o                                                           \
+  
+-@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+- $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+- $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
+- endif
+-+
+-+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+-+	python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+-+
+-+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+-+	python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+-+
+-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h
+ diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+ index 54efaad..02a89c3 100644
+ --- a/libavcodec/allcodecs.c
+
+From 38a49f21a7430779830d9d4e2468e76de6faf92c Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:11:37 +0100
+Subject: [PATCH 73/75] RBP: Add api to query gpu frame geometry
+
+---
+ xbmc/linux/RBP.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ xbmc/linux/RBP.h   | 12 +++++++
+ 2 files changed, 108 insertions(+), 1 deletion(-)
+
+diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp
+index 238eba372af2cbab11d7543c857ee47640901d13..79f932378cf37747be79e65fd0c2e2476f95474f 100644
+--- a/xbmc/linux/RBP.cpp
++++ b/xbmc/linux/RBP.cpp
+@@ -27,6 +27,7 @@
+ #include "utils/log.h"
+ 
+ #include "cores/omxplayer/OMXImage.h"
++#include <interface/mmal/mmal.h>
+ 
+ #include <sys/ioctl.h>
+ #include "rpi/rpi_user_vcsm.h"
+@@ -39,6 +40,41 @@
+ static int mbox_open();
+ static void mbox_close(int file_desc);
+ 
++typedef struct vc_image_extra_uv_s {
++   void *u, *v;
++   int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union {
++   VC_IMAGE_EXTRA_UV_T uv;
++} VC_IMAGE_EXTRA_T;
++
++struct VC_IMAGE_T {
++   unsigned short                  type;           /* should restrict to 16 bits */
++   unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
++   unsigned short                  width;          /* width in pixels */
++   unsigned short                  height;         /* height in pixels */
++   int                             pitch;          /* pitch of image_data array in bytes */
++   int                             size;           /* number of bytes available in image_data array */
++   void                           *image_data;     /* pixel data */
++   VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
++   void                           *metadata;       /* metadata header for the image */
++   void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
++   uint32_t                        mem_handle;     /* the mem handle for relocatable memory storage */
++   int                             metadata_size;  /* size of metadata of each channel in bytes */
++   int                             channel_offset; /* offset of consecutive channels in bytes */
++   uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++   uint8_t                         num_channels;   /* number of channels (2 for stereo) */
++   uint8_t                         current_channel;/* the channel this header is currently pointing to */
++   uint8_t                         linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
++   uint8_t                         is_channel_linked;     /* Track if the above structure is being used to link the header
++                                                             into a linked-multichannel image */
++   uint8_t                         channel_index;         /* index of the channel this header represents while
++                                                             it is being linked. */
++   uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
++};
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
++
+ CRBP::CRBP()
+ {
+   m_initialized     = false;
+@@ -322,7 +358,7 @@ static unsigned mem_lock(int file_desc, unsigned handle)
+    return p[5];
+ }
+ 
+-unsigned mem_unlock(int file_desc, unsigned handle)
++static unsigned mem_unlock(int file_desc, unsigned handle)
+ {
+    int i=0;
+    unsigned p[32];
+@@ -341,6 +377,32 @@ unsigned mem_unlock(int file_desc, unsigned handle)
+    return p[5];
+ }
+ 
++
++#define GET_VCIMAGE_PARAMS 0x30044
++static int get_image_params(int file_desc, VC_IMAGE_T * img)
++{
++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++    uint32_t * p = buf;
++    void * rimg;
++    int rv;
++
++    *p++ = 0; // size
++    *p++ = 0; // process request
++    *p++ = GET_VCIMAGE_PARAMS;
++    *p++ = sizeof(*img);
++    *p++ = sizeof(*img);
++    rimg = p;
++    memcpy(p, img, sizeof(*img));
++    p += sizeof(*img) / sizeof(*p);
++    *p++ = 0;  // End tag
++    buf[0] = (p - buf) * sizeof(*p);
++
++    rv = mbox_property(file_desc, buf);
++    memcpy(img, rimg, sizeof(*img));
++
++    return rv;
++}
++
+ CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached)
+ {
+   m_numbytes = numbytes;
+@@ -372,6 +434,39 @@ void CGPUMEM::Flush()
+   vcsm_clean_invalid( &iocache );
+ }
+ 
++AVRpiZcFrameGeometry CRBP::GetFrameGeometry(uint32_t encoding, unsigned short video_width, unsigned short video_height)
++{
++  AVRpiZcFrameGeometry geo = {};
++  struct VC_IMAGE_T img = {};
++
++  if (encoding == MMAL_ENCODING_YUVUV128)
++  {
++    img.type = VC_IMAGE_YUV_UV;
++    img.width = video_width;
++    img.height = video_height;
++    int rc = get_image_params(GetMBox(), &img);
++    assert(rc == 0);
++    const unsigned int stripe_w = 128;
++    geo.stride_y = stripe_w;
++    geo.stride_c = stripe_w;
++    geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++    geo.height_c = img.pitch / stripe_w - geo.height_y;
++    geo.planes_c = 1;
++    geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++  }
++  else if (encoding == MMAL_ENCODING_I420)
++  {
++    geo.stride_y = (video_width + 31) & ~31;
++    geo.stride_c = geo.stride_y / 2;
++    geo.height_y = (video_height + 15) & ~15;
++    geo.height_c = geo.height_y / 2;
++    geo.planes_c = 2;
++    geo.stripes = 1;
++  }
++  else assert(0);
++  return geo;
++}
++
+ double CRBP::AdjustHDMIClock(double adjust)
+ {
+   char response[80];
+diff --git a/xbmc/linux/RBP.h b/xbmc/linux/RBP.h
+index 815d758e7086d73b4d4eb16849fdbb509a3c251d..a7f07403854b81996cca72eff82e3a7d591c9209 100644
+--- a/xbmc/linux/RBP.h
++++ b/xbmc/linux/RBP.h
+@@ -41,6 +41,17 @@
+ #include "threads/CriticalSection.h"
+ #include "threads/Event.h"
+ 
++
++typedef struct AVRpiZcFrameGeometry
++{
++  unsigned int stride_y;
++  unsigned int height_y;
++  unsigned int stride_c;
++  unsigned int height_c;
++  unsigned int planes_c;
++  unsigned int stripes;
++} AVRpiZcFrameGeometry;
++
+ class CGPUMEM
+ {
+ public:
+@@ -82,6 +93,7 @@ public:
+   uint32_t WaitVsync(uint32_t target = ~0U);
+   void VSyncCallback();
+   int GetMBox() { return m_mb; }
++  AVRpiZcFrameGeometry GetFrameGeometry(uint32_t encoding, unsigned short video_width, unsigned short video_height);
+   double AdjustHDMIClock(double adjust);
+   double GetAdjustHDMIClock() { return m_actual_pll_adjust; }
+ 
+
+From 1856e86917eef62f3069c465d7c8ff2f8e213395 Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:12:28 +0100
+Subject: [PATCH 74/75] MMALFFmpeg: Add Sand/YUVUV128 support
+
+---
+ .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp        |  2 +-
+ xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h |  3 ++
+ .../VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp     | 51 +++++++++++++++-------
+ 3 files changed, 39 insertions(+), 17 deletions(-)
+
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+index c820a04c903866862b5ff04b38124ff0f7f7c17f..8444d0df598caef958e4ac3254419f3b4f95c513 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp
+@@ -241,7 +241,7 @@ enum AVPixelFormat CDVDVideoCodecFFmpeg::GetFormat(struct AVCodecContext * avctx
+ #endif
+ 
+ #ifdef HAS_MMAL
+-    if (*cur == AV_PIX_FMT_YUV420P)
++    if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128)
+     {
+       MMAL::CDecoder* dec = new MMAL::CDecoder(ctx->m_processInfo, ctx->m_hints);
+       if(dec->Open(avctx, ctx->m_pCodecContext, *cur, ctx->m_uSurfacesCount))
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
+index 1e49f09574c2a93b938d5eb405ebcb06543dec01..aecf0c54093092332b4a31a694472669cec84cb5 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.h
+@@ -41,6 +41,7 @@
+ #include "rendering/RenderSystem.h"
+ #include "cores/VideoPlayer/VideoRenderers/BaseRenderer.h"
+ #include "cores/VideoPlayer/DVDResource.h"
++#include "linux/RBP.h"
+ 
+ 
+ enum MMALState { MMALStateNone, MMALStateHWDec, MMALStateFFDec, MMALStateDeint, };
+@@ -60,6 +61,8 @@ public:
+   unsigned int m_height;
+   unsigned int m_aligned_width;
+   unsigned int m_aligned_height;
++  unsigned int m_size;
++  AVRpiZcFrameGeometry m_geo;
+   uint32_t m_encoding;
+   float m_aspect_ratio;
+   MMALState m_state;
+diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
+index f9b7172c45d5a0158259ebfb53ea75696f0acb6d..456214a679779469ea52db7ce846a3871147f685 100644
+--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALFFmpeg.cpp
+@@ -47,7 +47,6 @@ using namespace MMAL;
+ CMMALYUVBuffer::CMMALYUVBuffer(CDecoder *omv, std::shared_ptr<CMMALPool> pool, uint32_t mmal_encoding, uint32_t width, uint32_t height, uint32_t aligned_width, uint32_t aligned_height, uint32_t size)
+ : CMMALBuffer(pool), m_omv(omv)
+ {
+-  uint32_t size_pic = 0;
+   m_width = width;
+   m_height = height;
+   m_aligned_width = aligned_width;
+@@ -57,21 +56,18 @@ CMMALYUVBuffer::CMMALYUVBuffer(CDecoder *omv, std::shared_ptr<CMMALPool> pool, u
+   mmal_buffer = nullptr;
+   m_rendered = false;
+   m_stills = false;
+-  if (m_encoding == MMAL_ENCODING_I420)
+-    size_pic = (m_aligned_width * m_aligned_height * 3) >> 1;
+-  else if (m_encoding == MMAL_ENCODING_YUVUV128)
+-    size_pic = (m_aligned_width * m_aligned_height * 3) >> 1;
+-  else if (m_encoding == MMAL_ENCODING_ARGB || m_encoding == MMAL_ENCODING_RGBA || m_encoding == MMAL_ENCODING_ABGR || m_encoding == MMAL_ENCODING_BGRA)
+-    size_pic = (m_aligned_width << 2) * m_aligned_height;
+-  else if (m_encoding == MMAL_ENCODING_RGB16)
+-    size_pic = (m_aligned_width << 1) * m_aligned_height;
+-  else assert(0);
+-  if (size)
++
++  if (size == 0)
+   {
+-    assert(size_pic <= size);
+-    size_pic = size;
++    m_geo = g_RBP.GetFrameGeometry(m_encoding, aligned_width, aligned_height);
++    const unsigned int size_y = m_geo.stride_y * m_geo.height_y;
++    const unsigned int size_c = m_geo.stride_c * m_geo.height_c;
++    m_size = (size_y + size_c * m_geo.planes_c) * m_geo.stripes;
+   }
+-  gmem = m_pool->AllocateBuffer(size_pic);
++  else
++    m_size = size;
++  assert(m_size > 0);
++  gmem = m_pool->AllocateBuffer(m_size);
+   if (gmem)
+     gmem->m_opaque = (void *)this;
+   if (VERBOSE && g_advancedSettings.CanLogComponent(LOGVIDEO))
+@@ -155,6 +151,8 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+   uint32_t mmal_format = 0;
+   if (dec->m_fmt == AV_PIX_FMT_YUV420P)
+     mmal_format = MMAL_ENCODING_I420;
++  else if (frame->format == AV_PIX_FMT_SAND128)
++    mmal_format = MMAL_ENCODING_YUVUV128;
+   else if (dec->m_fmt == AV_PIX_FMT_ARGB)
+     mmal_format = MMAL_ENCODING_ARGB;
+   else if (dec->m_fmt == AV_PIX_FMT_RGBA)
+@@ -178,7 +176,7 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+ 
+   CSingleLock lock(dec->m_section);
+   CGPUMEM *gmem = YUVBuffer->gmem;
+-  AVBufferRef *buf = av_buffer_create((uint8_t *)gmem->m_arm, (YUVBuffer->m_aligned_width * YUVBuffer->m_aligned_height * 3)>>1, CDecoder::FFReleaseBuffer, gmem, AV_BUFFER_FLAG_READONLY);
++  AVBufferRef *buf = av_buffer_create((uint8_t *)gmem->m_arm, YUVBuffer->m_size, CDecoder::FFReleaseBuffer, gmem, AV_BUFFER_FLAG_READONLY);
+   if (!buf)
+   {
+     CLog::Log(LOGERROR, "%s::%s av_buffer_create() failed", CLASSNAME, __FUNCTION__);
+@@ -203,6 +201,27 @@ int CDecoder::FFGetBuffer(AVCodecContext *avctx, AVFrame *frame, int flags)
+     frame->data[1] = frame->data[0] + YUVBuffer->m_aligned_width * YUVBuffer->m_aligned_height;
+     frame->data[2] = frame->data[1] + (YUVBuffer->m_aligned_width>>1) * (YUVBuffer->m_aligned_height>>1);
+   }
++  else if (frame->format == AV_PIX_FMT_SAND128)
++  {
++    const unsigned int size_y = YUVBuffer->m_geo.stride_y * YUVBuffer->m_geo.height_y;
++    const unsigned int size_c = YUVBuffer->m_geo.stride_c * YUVBuffer->m_geo.height_c;
++
++    frame->buf[0] = buf;
++
++    frame->linesize[0] = YUVBuffer->m_geo.stride_y;
++    frame->linesize[1] = YUVBuffer->m_geo.stride_c;
++    frame->linesize[2] = YUVBuffer->m_geo.stride_c;
++    if (YUVBuffer->m_geo.stripes > 1)
++        frame->linesize[3] = YUVBuffer->m_geo.height_y + YUVBuffer->m_geo.height_c;      // abuse: linesize[3] = stripe stride
++
++    frame->data[0] = (uint8_t *)gmem->m_arm;
++    frame->data[1] = frame->data[0] + size_y;
++    if (YUVBuffer->m_geo.planes_c > 1)
++        frame->data[2] = frame->data[1] + size_c;
++
++    frame->extended_data = frame->data;
++    // Leave extended buf alone
++  }
+   else if (dec->m_fmt == AV_PIX_FMT_BGR0)
+   {
+     frame->buf[0] = buf;
+@@ -283,7 +302,7 @@ bool CDecoder::GetPicture(AVCodecContext* avctx, AVFrame* frame, DVDVideoPicture
+   if (!ret)
+     return false;
+ 
+-  if ((frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_BGR0 && frame->format != AV_PIX_FMT_RGB565LE) ||
++  if ((frame->format != AV_PIX_FMT_YUV420P && frame->format != AV_PIX_FMT_SAND128 && frame->format != AV_PIX_FMT_BGR0 && frame->format != AV_PIX_FMT_RGB565LE) ||
+       frame->buf[1] != nullptr || frame->buf[0] == nullptr)
+     return false;
+ 
+
+From ed215d6a95935eabbbb5f56d9259b24e8ab4929d Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 19 May 2017 15:10:42 +0100
+Subject: [PATCH 75/75] ffmpeg: hevc: Update to latest version
+
+---
+ .../target/ffmpeg/pfcd_hevc_optimisations.patch    | 11940 ++++++++++++-------
+ 1 file changed, 7660 insertions(+), 4280 deletions(-)
+
+diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+index 58379fb0874521205184c53be5aae893cfd39d49..96cfa9ae30e72b377b2561cf7a329e02b9212ceb 100644
+--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
+@@ -11,7 +11,7 @@ index 524fb73..305632b 100644
+  /ffplay
+  /ffprobe
+ diff --git a/ffmpeg.c b/ffmpeg.c
+-index 9ffd833..7a86d7e 100644
++index 9ffd833..e2474e5 100644
+ --- a/ffmpeg.c
+ +++ b/ffmpeg.c
+ @@ -23,6 +23,11 @@
+@@ -52,7 +52,7 @@ index 9ffd833..7a86d7e 100644
+  #if HAVE_SYS_RESOURCE_H
+  #include <sys/time.h>
+  #include <sys/types.h>
+-@@ -158,6 +182,169 @@ static int restore_tty;
++@@ -158,6 +182,182 @@ static int restore_tty;
+  static void free_input_threads(void);
+  #endif
+  
+@@ -100,7 +100,7 @@ index 9ffd833..7a86d7e 100644
+ +  mmal_buffer_header_release(buffer);
+ +}
+ +
+-+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h)
+++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h)
+ +{
+ +    MMAL_COMPONENT_T* display;
+ +    MMAL_DISPLAYREGION_T region =
+@@ -111,7 +111,7 @@ index 9ffd833..7a86d7e 100644
+ +        .fullscreen = 0,
+ +        .dest_rect = {x, y, w, h}
+ +    };
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h);
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
+ +
+ +    bcm_host_init();  // TODO is this needed?
+ +    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display);
+@@ -121,7 +121,7 @@ index 9ffd833..7a86d7e 100644
+ +
+ +    {
+ +        MMAL_ES_FORMAT_T* format = display->input[0]->format;
+-+        format->encoding = MMAL_ENCODING_I420;
+++        format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420;
+ +        format->es->video.width = geo.stride_y;
+ +        format->es->video.height = geo.height_y;
+ +        format->es->video.crop.x = 0;
+@@ -138,7 +138,7 @@ index 9ffd833..7a86d7e 100644
+ +    mmal_port_enable(display->input[0],display_cb_input);
+ +    mmal_port_enable(display->control,display_cb_control);
+ +
+-+    printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y);
+++    printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
+ +
+ +    return display;
+ +}
+@@ -168,12 +168,24 @@ index 9ffd833..7a86d7e 100644
+ +#ifdef RPI_ZERO_COPY
+ +{
+ +    const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1);
+++    if (fr_buf == NULL) {
+++        mmal_buffer_header_release(buf);
+++        return;
+++    }
+ +
+ +    buf->user_data = fr_buf;
+ +    buf->data = av_rpi_zc_vc_handle(fr_buf);
+-+    buf->alloc_size =
+-+        buf->length = av_rpi_zc_numbytes(fr_buf);
+-+
+++    buf->offset = av_rpi_zc_offset(fr_buf);
+++    buf->length = av_rpi_zc_length(fr_buf);
+++    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
+++#if 0
+++    {
+++        unsigned int n;
+++        for (n = 0; n < fr->width; n += 128) {
+++            memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2);
+++        }
+++    }
+++#endif
+ +    ++rpi_display_count;
+ +}
+ +#else
+@@ -208,6 +220,7 @@ index 9ffd833..7a86d7e 100644
+ +
+ +static void display_exit(MMAL_COMPONENT_T* display)
+ +{
+++//    sleep(120);
+ +    if (display) {
+ +        mmal_component_destroy(display);
+ +    }
+@@ -222,7 +235,7 @@ index 9ffd833..7a86d7e 100644
+  /* sub2video hack:
+     Convert subtitles to video with alpha to insert them in filter graphs.
+     This is a temporary solution until libavfilter gets real subtitles support.
+-@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret)
++@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret)
+          avformat_close_input(&input_files[i]->ctx);
+          av_freep(&input_files[i]);
+      }
+@@ -234,7 +247,7 @@ index 9ffd833..7a86d7e 100644
+      for (i = 0; i < nb_input_streams; i++) {
+          InputStream *ist = input_streams[i];
+  
+-@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret)
++@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret)
+          av_freep(&ist->filters);
+          av_freep(&ist->hwaccel_device);
+  
+@@ -244,7 +257,7 @@ index 9ffd833..7a86d7e 100644
+          avcodec_free_context(&ist->dec_ctx);
+  
+          av_freep(&input_streams[i]);
+-@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret)
++@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret)
+      }
+      term_exit();
+      ffmpeg_exited = 1;
+@@ -252,7 +265,7 @@ index 9ffd833..7a86d7e 100644
+  }
+  
+  void remove_avoptions(AVDictionary **a, AVDictionary *b)
+-@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s,
++@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s,
+      if (ost->source_index >= 0)
+          ist = input_streams[ost->source_index];
+  
+@@ -260,7 +273,7 @@ index 9ffd833..7a86d7e 100644
+ +    if (next_picture && ist != NULL)
+ +    {
+ +        if (!rpi_display)
+-+           rpi_display = display_init(0,0,next_picture->width,next_picture->height);
+++            rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height);
+ +        display_frame(ist->dec_ctx, rpi_display, next_picture);
+ +    }
+ +#endif
+@@ -268,7 +281,7 @@ index 9ffd833..7a86d7e 100644
+      if (filter->inputs[0]->frame_rate.num > 0 &&
+          filter->inputs[0]->frame_rate.den > 0)
+          duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base));
+-@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
++@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+          ist->dec_ctx->opaque                = ist;
+          ist->dec_ctx->get_format            = get_format;
+          ist->dec_ctx->get_buffer2           = get_buffer;
+@@ -282,22 +295,23 @@ index 9ffd833..7a86d7e 100644
+  
+          av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
+ diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+-index fd0d1f0..40d22d2 100644
++index fd0d1f0..1740768 100644
+ --- a/libavcodec/Makefile
+ +++ b/libavcodec/Makefile
+-@@ -5,6 +5,11 @@ NAME = avcodec
++@@ -5,6 +5,12 @@ NAME = avcodec
+  HEADERS = avcodec.h                                                     \
+            avdct.h                                                       \
+            avfft.h                                                       \
+ +          rpi_qpu.h                                                     \
+ +          rpi_shader.h                                                  \
+++	  rpi_shader_cmd.h                                              \
+ +          rpi_mailbox.h                                                 \
+ +          rpi_hevc_transform.h                                          \
+ +          rpi_zc.h                                                      \
+            d3d11va.h                                                     \
+            dirac.h                                                       \
+            dv_profile.h                                                  \
+-@@ -43,6 +48,10 @@ OBJS = allcodecs.o                                                      \
++@@ -43,6 +49,10 @@ OBJS = allcodecs.o                                                      \
+         resample.o                                                       \
+         resample2.o                                                      \
+         utils.o                                                          \
+@@ -308,6 +322,22 @@ index fd0d1f0..40d22d2 100644
+         vorbis_parser.o                                                  \
+         xiph.o                                                           \
+  
++@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
++ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
++ $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
++ endif
+++
+++QASM := $(SUBDIR)../pi-util/qasm.py
+++
+++ifneq ("$(wildcard $(QASM))","")
+++$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm
+++	python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@
+++
+++$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm
+++	python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@
+++endif
+++
+++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h
+ diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+ index 54efaad..02a89c3 100644
+ --- a/libavcodec/allcodecs.c
+@@ -321,12 +351,14 @@ index 54efaad..02a89c3 100644
+      REGISTER_PARSER(MJPEG,              mjpeg);
+      REGISTER_PARSER(MLP,                mlp);
+ diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+-index a4ceca7..1354c14 100644
++index a4ceca7..cafd25d 100644
+ --- a/libavcodec/arm/Makefile
+ +++ b/libavcodec/arm/Makefile
+-@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
++@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
++ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+  NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
+++                                          arm/hevc_misc_neon.o          \
+                                            arm/hevcdsp_deblock_neon.o    \
+ +                                          arm/hevcdsp_epel_neon.o       \
+                                            arm/hevcdsp_idct_neon.o       \
+@@ -1015,18 +1047,592 @@ index 0000000..31d3c59
+ +#endif /* HAVE_ARMV6T2_INLINE */
+ +
+ +#endif /* AVCODEC_ARM_HEVC_CABAC_H */
++diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S
++new file mode 100644
++index 0000000..373576b
++--- /dev/null
+++++ b/libavcodec/arm/hevc_misc_neon.S
++@@ -0,0 +1,62 @@
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++@ rpi_zap_coeff_vals_neon(
+++@   uint16_t * buf,          [r0]
+++@   unsigned int log_n_m2)   [r1]
+++
+++function rpi_zap_coeff_vals_neon, export=1
+++        vmov.i64 q8, #0
+++        adr     r12, zc_tab
+++        vmov.i64 q9, #0
+++        tst     r0, #63
+++        vmov.i64 q10, #0
+++        add     r0, #63
+++        vmov.i64 q11, #0
+++        and     r0, #~63
+++        ldr     pc, [r12, r1, lsl #2]
+++
+++zc_tab:
+++        .word   zc_lc2
+++        .word   zc_lc3
+++        .word   zc_lc4
+++        .word   zc_lc5
+++
+++@ 4*4*2 = 32 bytes: zero 64 bytes if buf is 64-byte aligned, otherwise store nothing
+++zc_lc2:
+++        it eq
+++        vstmeq  r0, {q8-q11}
+++        bx      lr
+++
+++@ 16*16*2 = 512 = 64 * 8
+++zc_lc4:
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++        vstm    r0!, {q8-q11}
+++@ 8*8*2 = 128
+++zc_lc3:
+++        vstm    r0!, {q8-q11}
+++        vstm    r0,  {q8-q11}
+++        bx      lr
+++
+++@ 32*32*2 = 2048 = 128 * 16
+++zc_lc5:
+++        vmov.i64 q12, #0
+++        vmov.i64 q13, #0
+++        vmov.i64 q14, #0
+++        vmov.i64 q15, #0
+++        mov     r2, #4
+++1:
+++        vstm    r0!, {q8-q15}
+++        subs    r2, #1
+++        vstm    r0!, {q8-q15}
+++        vstm    r0!, {q8-q15}
+++        vstm    r0!, {q8-q15}
+++        bne     1b
+++        bx      lr
+++
+++endfunc
+++
+ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
+-index 166bddb..a088cc3 100644
++index 166bddb..9bd0a42 100644
+ --- a/libavcodec/arm/hevcdsp_deblock_neon.S
+ +++ b/libavcodec/arm/hevcdsp_deblock_neon.S
+-@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
++@@ -15,7 +15,7 @@
++  *
++  * You should have received a copy of the GNU Lesser General Public
++  * License along with FFmpeg; if not, write to the Free Software
++- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++  */
++ 
++ 
++@@ -31,6 +31,9 @@
++         bxeq     lr
++ .endm
++ 
+++@ Uses: d2, d4, d18, d19
+++@ Returns: d2, d4
+++@ Modifies: d0-d7, d22-d25
++ .macro hevc_loop_filter_chroma_body
++         vsubl.u8  q3, d4, d2
++         vsubl.u8  q11, d18, d19
++@@ -49,6 +52,33 @@
++         vqmovun.s16 d4, q2
++ .endm
++ 
+++
+++@ Uses r2[0:7], r2[8:15]
+++@ Modifies: d0-d7, d22-d25
+++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1
+++        vsubl.u8  q3, \Q0, \P0
+++        vsubl.u8  q11, \P1, \Q1
+++        vshl.i16  q3, #2
+++        vadd.i16  q11, q3
+++
+++        @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all)
+++        vdup.16   d0, r2
+++        vmovl.u8  q0, d0
+++        vuzp.16   d0, d1
+++
+++        vrshr.s16 q11, q11, #3
+++        vneg.s16  q12, q0
+++        vmovl.u8  q2, \Q0
+++        vmin.s16  q11, q11, q0
+++        vmax.s16  q11, q11, q12
+++        vaddw.u8  q1, q11, \P0
+++        vsub.i16  q2, q11
+++        vqmovun.s16 \P0, q1
+++        vqmovun.s16 \Q0, q2
+++.endm
+++
+++
+++
++ .macro hevc_loop_filter_luma_start
++         ldr     r12, [r3]
++         ldr      r3, [r3, #4]
++@@ -60,15 +90,17 @@
++         lsr      r3, #16
++ .endm
++ 
++-.macro hevc_loop_filter_luma_body
+++@ Uses: r2, r3, r12
+++@ Modifies: r5, r6, r7, r8, r9
+++function hevc_loop_filter_luma_body
+++        vmovl.u8  q15, d23
+++        vmovl.u8  q14, d22
+++        vmovl.u8  q13, d21
+++        vmovl.u8  q12, d20
+++        vmovl.u8  q11, d19
+++        vmovl.u8  q10, d18
+++        vmovl.u8  q9, d17
++         vmovl.u8  q8, d16
++-        vmovl.u8  q9, d18
++-        vmovl.u8  q10, d20
++-        vmovl.u8  q11, d22
++-        vmovl.u8  q12, d24
++-        vmovl.u8  q13, d26
++-        vmovl.u8  q14, d28
++-        vmovl.u8  q15, d30
++ 
++         vadd.i16   q7, q9, q11
++         vadd.i16   q6, q14, q12
++@@ -77,7 +109,6 @@
++         vabd.s16   q7, q7, q10
++         vabd.s16   q6, q6, q13
++ 
++-
++         vdup.16    q0, r2
++         vmov       q4, q7
++         vmov       q5, q6
++@@ -152,7 +183,7 @@
++ 
++         and        r9, r8, r7
++         cmp        r9, #0
++-        beq        weakfilter_\@
+++        beq        weakfilter_
++ 
++         vadd.i16  q2, q11, q12
++         vadd.i16  q4, q9, q8
++@@ -210,11 +241,11 @@
++         vbit      q13, q3, q5
++         vbit      q14, q2, q5
++ 
++-weakfilter_\@:
+++weakfilter_:
++         mvn       r8, r8
++         and       r9, r8, r7
++         cmp       r9, #0
++-        beq       ready_\@
+++        beq       ready_
++ 
++         vdup.16    q4, r2
++ 
++@@ -275,75 +306,345 @@ weakfilter_\@:
++         vbit      q11, q0, q5
++         vbit      q12, q4, q5
++ 
++-ready_\@:
+++ready_:
++         vqmovun.s16 d16, q8
++-        vqmovun.s16 d18, q9
++-        vqmovun.s16 d20, q10
++-        vqmovun.s16 d22, q11
++-        vqmovun.s16 d24, q12
++-        vqmovun.s16 d26, q13
++-        vqmovun.s16 d28, q14
++-        vqmovun.s16 d30, q15
++-.endm
+++        vqmovun.s16 d17, q9
+++        vqmovun.s16 d18, q10
+++        vqmovun.s16 d19, q11
+++        vqmovun.s16 d20, q12
+++        vqmovun.s16 d21, q13
+++        vqmovun.s16 d22, q14
+++        vqmovun.s16 d23, q15
+++        mov       pc, lr
+++endfunc
+++
+++@ ff_hevc_v_loop_filter_luma2_neon_8(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
+++function ff_hevc_v_loop_filter_luma2_neon_8, export=1
+++        hevc_loop_filter_luma_start
+++        push     {r4-r10,lr}       @ 8 regs = 32 bytes
+++
+++        ldr      r4, [sp, #40]
+++        b        v_loop_luma_common
+++endfunc
+++
++ 
++ function ff_hevc_v_loop_filter_luma_neon, export=1
++         hevc_loop_filter_luma_start
++-        push     {r5-r11}
+++        push     {r4-r10,lr}
+++
+++        sub      r4, r0, #4
+++v_loop_luma_common:
+++        @ Why this isn't a bitmask to start with I have no idea...
+++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+++        ldr      r5, [sp, #32]
+++        ldrh     r10, [r5]
+++        ldr      r5, [sp, #36]
+++        ldrh     r5, [r5]
+++        orr      r10, r10, r5, lsl #16  @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1]
+++
++         vpush    {d8-d15}
++-        sub      r0, #4
++-        vld1.8   {d16}, [r0], r1
++-        vld1.8   {d18}, [r0], r1
++-        vld1.8   {d20}, [r0], r1
++-        vld1.8   {d22}, [r0], r1
++-        vld1.8   {d24}, [r0], r1
++-        vld1.8   {d26}, [r0], r1
++-        vld1.8   {d28}, [r0], r1
++-        vld1.8   {d30}, [r0], r1
++-        sub      r0, r0, r1, lsl #3
++-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
++-        hevc_loop_filter_luma_body
++-        transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30
++-        vst1.8   {d16}, [r0], r1
++-        vst1.8   {d18}, [r0], r1
++-        vst1.8   {d20}, [r0], r1
++-        vst1.8   {d22}, [r0], r1
++-        vst1.8   {d24}, [r0], r1
++-        vst1.8   {d26}, [r0], r1
++-        vst1.8   {d28}, [r0], r1
++-        vst1.8   {d30}, [r0]
+++
+++        @ Uses slightly fewer instructions to do laned loads than unlaned
+++        @ and transpose.  This also means that we can use the same code for
+++        @ both split & unsplit deblock
+++        vld4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1
+++        vld4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1
+++
+++        vld4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+++        vld4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+++
+++        vld4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+++        vld4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+++
+++        vld4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+++        vld4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+++
+++        vld4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+++        vld4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+++
+++        vld4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+++        vld4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+++
+++        vld4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+++        vld4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+++
+++        vld4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32]
+++        vld4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32]
+++
+++        bl hevc_loop_filter_luma_body
+++
+++        neg     r1, r1
+++
+++        @ no_p[1]
+++        tst     r10, #0xff00
+++        itt ne
+++        addne    r4, r4, r1, lsl #2
+++        bne     1f
+++        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+++        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+++        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+++        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+++
+++1:
+++        @ no_q[1]
+++        tst     r10, #0xff000000
+++        itt ne
+++        addne    r0, r0, r1, lsl #2
+++        bne     2f
+++        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+++        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+++        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+++        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+++
+++2:
+++        @ no_p[0]
+++        tst     r10, #0xff
+++        bne     3f
+++        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+++        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+++        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+++        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
+++
+++3:
+++        @ no_q[0]
+++        tst     r10, #0xff0000
+++        bne     4f
+++        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+++        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+++        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+++        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
+++
+++4:
+++bypasswrite:
++         vpop     {d8-d15}
++-        pop      {r5-r11}
++-        bx lr
+++        pop      {r4-r10,pc}
++ endfunc
++ 
+++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
+++@                                 ptrdiff_t stride, [r1]
+++@                                 int beta,         [r2]
+++@                                 int32_t *tc,      [r3]
+++@                                 uint8_t *no_p,    sp[0]
+++@                                 uint8_t *no_q);   sp[4]
+++@
+++@ Src should always be on an 8 byte boundary & all in the same slice
+++
++ function ff_hevc_h_loop_filter_luma_neon, export=1
++         hevc_loop_filter_luma_start
++-        push     {r5-r11}
+++        push     {r4-r10,lr}
+++
++         vpush    {d8-d15}
++         sub      r0, r0, r1, lsl #2
+++
++         vld1.8  {d16}, [r0], r1
+++        vld1.8  {d17}, [r0], r1
++         vld1.8  {d18}, [r0], r1
+++        vld1.8  {d19}, [r0], r1
++         vld1.8  {d20}, [r0], r1
+++        vld1.8  {d21}, [r0], r1
++         vld1.8  {d22}, [r0], r1
++-        vld1.8  {d24}, [r0], r1
++-        vld1.8  {d26}, [r0], r1
++-        vld1.8  {d28}, [r0], r1
++-        vld1.8  {d30}, [r0], r1
++-        sub        r0, r0, r1, lsl #3
++-        add        r0, r1
++-        hevc_loop_filter_luma_body
++-        vst1.8   {d18}, [r0], r1
++-        vst1.8   {d20}, [r0], r1
++-        vst1.8   {d22}, [r0], r1
++-        vst1.8   {d24}, [r0], r1
++-        vst1.8   {d26}, [r0], r1
++-        vst1.8   {d28}, [r0]
++-bypasswrite:
+++        vld1.8  {d23}, [r0]
+++
+++        bl hevc_loop_filter_luma_body
+++
++         vpop     {d8-d15}
++-        pop      {r5-r11}
++-        bx lr
+++
+++        neg     r1, r1
+++        add     r0, r0, r1
+++
+++        @ Why this isn't a bitmask to start with I have no idea...
+++        @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+++        ldr      r5, [sp, #32]
+++        ldrh     r10, [r5]
+++        ldr      r5, [sp, #36]
+++        ldrh     r5, [r5]
+++        orrs     r10, r10, r5, lsl #16  @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+++        bne      1f
+++
+++        vst1.8  {d22}, [r0], r1
+++        vst1.8  {d21}, [r0], r1
+++        vst1.8  {d20}, [r0], r1
+++        vst1.8  {d19}, [r0], r1
+++        vst1.8  {d18}, [r0], r1
+++        vst1.8  {d17}, [r0]
+++
+++        pop      {r4-r10,pc}
+++
+++@ Partial write
+++1:
+++        vmov     r2, r3, d22
+++        vmov     r4, r5, d21
+++        vmov     r6, r7, d20
+++
+++        tst      r10, #0xff0000
+++        ittt eq
+++        streq    r2, [r0]
+++        streq    r4, [r0, r1]
+++        streq    r6, [r0, r1, lsl # 1]
+++
+++        add      r0, r0, #4
+++        tst      r10, #0xff000000
+++        ittt eq
+++        streq    r3, [r0]
+++        streq    r5, [r0, r1]
+++        streq    r7, [r0, r1, lsl # 1]
+++
+++        vmov     r2, r3, d19
+++        vmov     r4, r5, d18
+++        vmov     r6, r7, d17
+++        add      r0, r0, r1
+++        add      r0, r0, r1, lsl # 1
+++
+++        tst      r10, #0xff00
+++        ittt eq
+++        streq    r3, [r0]
+++        streq    r5, [r0, r1]
+++        streq    r7, [r0, r1, lsl # 1]
+++
+++        tst      r10, #0xff
+++        ittt eq
+++        streq    r2, [r0, #-4]!
+++        streq    r4, [r0, r1]
+++        streq    r6, [r0, r1, lsl # 1]
+++
+++        pop      {r4-r10,pc}
+++
++ endfunc
++ 
+++@ void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src_r,       // r0
+++@                                      unsigned int stride,   // r1
+++@                                      uint32_t tc4,          // r2
+++@                                      unsigned int no_f);    // r3
+++@
+++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++function ff_hevc_h_loop_filter_uv_neon_8, export=1
+++        sub      r0, r0, r1, lsl #1
+++        vld2.8   {d16,d17}, [r0], r1
+++        vld2.8   {d18,d19}, [r0], r1
+++        vld2.8   {d26,d27}, [r0], r1
+++        vld2.8   {d28,d29}, [r0]
+++        sub      r0, r0, r1, lsl #1
+++        hevc_loop_filter_uv_body d16, d18, d26, d28
+++        lsr      r2, r2, #16
+++        hevc_loop_filter_uv_body d17, d19, d27, d29
+++        cmp      r3, #0
+++        bne      1f
+++        vst2.8   {d18,d19}, [r0], r1
+++        vst2.8   {d26,d27}, [r0]
+++        bx       lr
+++
+++        @ At least one no_f bit is set
+++        @ Which means we need to break this apart in an ugly fashion
+++1:      vzip.8   d18, d19
+++        vzip.8   d26, d27
+++        sub      r1, r1, #8
+++
+++        tst      r3, #1
+++        bne      1f
+++        vst1.8   {d18}, [r0]
+++1:      add      r0, r0, #8
+++        tst      r3, #2
+++        bne      2f
+++        vst1.8   {d19}, [r0]
+++2:      add      r0, r0, r1
+++
+++        tst      r3, #4
+++        bne      1f
+++        vst1.8   {d26}, [r0]
+++1:      add      r0, r0, #8
+++        tst      r3, #8
+++        it ne
+++        bxne     lr
+++        vst1.8   {d27}, [r0]
+++        bx       lr
+++
+++endfunc
+++
+++
+++@ void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r,      // r0
+++@                                       unsigned int stride,  // r1
+++@                                       uint32_t tc4,         // r2
+++@                                       uint8_t * src_l,      // r3
+++@                                       unsigned int no_f);   // sp[0]
+++@
+++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++function ff_hevc_v_loop_filter_uv2_neon_8, export=1
+++        vld4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
+++        vld4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0], r1
+++
+++        vld4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+++        vld4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+++
+++        vld4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+++        vld4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+++
+++        vld4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+++        vld4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+++
+++        vld4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+++        vld4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++
+++        vld4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+++        vld4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+++
+++        vld4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+++        vld4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+++
+++        vld4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3]
+++        vld4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0]
+++
+++        hevc_loop_filter_uv_body d16, d18, d26, d28
+++        lsr      r2, r2, #16
+++        hevc_loop_filter_uv_body d17, d19, d27, d29
+++
+++        neg      r1, r1
+++
+++        ldr      r2, [sp, #0]
+++
+++        @ p[1]
+++        tst      r2, #2
+++        itt ne
+++        addne    r3, r3, r1, lsl #2
+++        bne      1f
+++        vst4.8   {d16[7], d17[7], d18[7], d19[7]}, [r3], r1
+++        vst4.8   {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+++        vst4.8   {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+++        vst4.8   {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+++
+++1:
+++        @ q[1]
+++        tst      r2, #8
+++        itt ne
+++        addne    r0, r0, r1, lsl #2
+++        bne 2f
+++        vst4.8   {d26[7], d27[7], d28[7], d29[7]}, [r0], r1
+++        vst4.8   {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+++        vst4.8   {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+++        vst4.8   {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++
+++2:
+++        @ p[0]
+++        tst      r2, #1
+++        bne      3f
+++        vst4.8   {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+++        vst4.8   {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+++        vst4.8   {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+++        vst4.8   {d16[0], d17[0], d18[0], d19[0]}, [r3]
+++
+++3:
+++        @ q[0]
+++        tst      r2, #4
+++        it ne
+++        bxne     lr
+++        vst4.8   {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+++        vst4.8   {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+++        vst4.8   {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+++        vst4.8   {d26[0], d27[0], d28[0], d29[0]}, [r0]
+++
+++        bx       lr
+++endfunc
+++
+++
++ function ff_hevc_v_loop_filter_chroma_neon, export=1
++         hevc_loop_filter_chroma_start
++         sub      r0, #4
++@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+          vst1.8   {d4}, [r0]
+          bx       lr
+  endfunc
+ +
+-+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+-+ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+-+ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++ *                                            int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++ *                                            MvField *curr, MvField *neigh, uint8_t *bs)
+ + */
+ +function ff_hevc_deblocking_boundary_strengths_neon, export=1
+ +        add         ip, sp, #4*4
+@@ -1147,6 +1753,7 @@ index 166bddb..a088cc3 100644
+ +90:     mov         a3, #1
+ +        b           11b
+ +endfunc
+++
+ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+ new file mode 100644
+ index 0000000..00eab9e
+@@ -1491,10 +2098,10 @@ index 0000000..00eab9e
+ +       .byte 2, 16, 54, 4
+ +       .byte 2, 10, 58, 2
+ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
+-index 5591807..49c70dd 100644
++index 5591807..b6c48ee 100644
+ --- a/libavcodec/arm/hevcdsp_init_neon.c
+ +++ b/libavcodec/arm/hevcdsp_init_neon.c
+-@@ -22,6 +22,8 @@
++@@ -22,11 +22,26 @@
+  #include "libavutil/arm/cpu.h"
+  #include "libavcodec/hevcdsp.h"
+  #include "hevcdsp_arm.h"
+@@ -1503,7 +2110,25 @@ index 5591807..49c70dd 100644
+  
+  void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+  void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+-@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++
+++#ifdef RPI
+++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+++                             unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++                             const uint8_t no_p[2], const uint8_t no_q[2],
+++                             uint8_t * _pix_l);
+++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+++                             unsigned int no_f);
+++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                             uint8_t * src_l,
+++                             unsigned int no_f);
+++#endif
+++
++ void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
++@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
+  void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+  
+@@ -1522,10 +2147,20 @@ index 5591807..49c70dd 100644
+ +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+ +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table);
+ +
+++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height,
+++                                   const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo);
+++
+++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height);
+++
+++
+  #define PUT_PIXELS(name) \
+      void name(int16_t *dst, uint8_t *src, \
+                                  ptrdiff_t srcstride, int height, \
+-@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
++@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+  PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+  PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+  #undef PUT_PIXELS
+@@ -1541,7 +2176,7 @@ index 5591807..49c70dd 100644
+  
+  static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+                                     int height, int width);
+-@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
++@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
+      put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+  }
+  
+@@ -1587,6 +2222,50 @@ index 5591807..49c70dd 100644
+ +    }
+ +}
+ +
+++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{
+++    // SAO band filter for interleaved U/V samples: NEON for width 16 with an even height, C fallback otherwise.
+++    // Width 32 already dealt with; the width 16 code works in double lines (hence the even-height test).
+++    if (width == 16 && (height & 1) == 0) {
+++        ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, // NOTE(review): prototype in this file lists (stride_dst, stride_src) - confirm the order against the asm
+++                                          sao_offset_val_u, sao_left_class_u,
+++                                          sao_offset_val_v, sao_left_class_v,
+++                                          width, height);
+++    }
+++    else
+++    {
+++        const int shift  = 3; // BIT_DEPTH - 5: the top 5 bits of a sample select one of 32 bands
+++        int k, y, x;
+++        pixel *dst = (pixel *)_dst;
+++        pixel *src = (pixel *)_src;
+++        int8_t offset_table_u[32] = { 0 }; // per-band offset for U; bands not set below stay 0
+++        int8_t offset_table_v[32] = { 0 }; // per-band offset for V; bands not set below stay 0
+++
+++        stride_src /= sizeof(pixel); // byte stride -> stride in pixels
+++        stride_dst /= sizeof(pixel); // byte stride -> stride in pixels
+++
+++        for (k = 0; k < 4; k++) // offsets [1..4] go to four consecutive bands from the left class, wrapping mod 32
+++            offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+++        for (k = 0; k < 4; k++) // same mapping for V
+++            offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+++
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width * 2; x += 2) // width counts U/V pairs, so each row holds width * 2 samples
+++            {
+++                dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); // even index: U table
+++                dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); // odd index: V table
+++            }
+++            dst += stride_dst;
+++            src += stride_src;
+++
+++        }
+++    }
+++}
+++
+ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+ +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+ +                                          int16_t *_sao_offset_val, int eo, int width, int height)
+@@ -1665,6 +2344,54 @@ index 5591807..49c70dd 100644
+ +        }
+ +    }
+ +}
+++
+++// SAO edge filter for interleaved chroma (UV): asm for 32-wide blocks, C fallback otherwise.
+++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++                                  int eo, int width, int height)
+++{
+++    const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);  // fixed source stride; not a parameter
+++
+++    if (width == 32 && (height & 7) == 0) {  // asm path only taken for width 32, height a multiple of 8
+++        ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo);
+++    }
+++    else
+++    {
+++        static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };  // order in which the incoming offsets are copied below
+++        static const int8_t pos[4][2][2] = {  // indexed by eo: {x, y} steps to neighbours a and b
+++            { { -1,  0 }, {  1, 0 } }, // horizontal
+++            { {  0, -1 }, {  0, 1 } }, // vertical
+++            { { -1, -1 }, {  1, 1 } }, // 45 degree
+++            { {  1, -1 }, { -1, 1 } }, // 135 degree
+++        };
+++        int8_t sao_offset_val_u[8];  // padding of 3 for vld
+++        int8_t sao_offset_val_v[8];  // padding of 3 for vld
+++        pixel *dst = (pixel *)_dst;
+++        pixel *src = (pixel *)_src;
+++        int a_stride, b_stride;
+++        int x, y;
+++
+++        for (x = 0; x < 5; x++) {  // copy the 5 offsets through edge_idx, narrowing int16 to int8
+++            sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]];
+++            sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]];
+++        }
+++
+++        a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;  // x step scaled by 2 (U,V interleaved), y step by stride
+++        b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+++        for (y = 0; y < height; y++) {
+++            for (x = 0; x < width * 2; x += 2) {  // one U and one V sample per step
+++                int diff0u = CMP(src[x], src[x + a_stride]);
+++                int diff1u = CMP(src[x], src[x + b_stride]);
+++                int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+++                int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+++                dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]);  // CMP yields -1/0/1, so the index is 0..4
+++                dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]);
+++            }
+++            src += stride_src;
+++            dst += stride_dst;
+++        }
+++    }
+++}
+ +#undef CMP
+ +
+ +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+@@ -1674,18 +2401,36 @@ index 5591807..49c70dd 100644
+  av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+  {
+      if (bit_depth == 8) {
+-@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++         int x;
++         c->hevc_v_loop_filter_luma     = ff_hevc_v_loop_filter_luma_neon;
+++        c->hevc_v_loop_filter_luma_c   = ff_hevc_v_loop_filter_luma_neon;
++         c->hevc_h_loop_filter_luma     = ff_hevc_h_loop_filter_luma_neon;
+++        c->hevc_h_loop_filter_luma_c   = ff_hevc_h_loop_filter_luma_neon;
++         c->hevc_v_loop_filter_chroma   = ff_hevc_v_loop_filter_chroma_neon;
++         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_neon;
+++#ifdef RPI
+++        c->hevc_v_loop_filter_luma2    = ff_hevc_v_loop_filter_luma2_neon_8;
+++        c->hevc_h_loop_filter_uv       = ff_hevc_h_loop_filter_uv_neon_8;
+++        c->hevc_v_loop_filter_uv2      = ff_hevc_v_loop_filter_uv2_neon_8;
+++#endif
++         c->idct[0]                     = ff_hevc_transform_4x4_neon_8;
++         c->idct[1]                     = ff_hevc_transform_8x8_neon_8;
++         c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_neon_8;
++@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+          c->transform_add[2]            = ff_hevc_transform_add_16x16_neon_8;
+          c->transform_add[3]            = ff_hevc_transform_add_32x32_neon_8;
+          c->idct_4x4_luma               = ff_hevc_transform_luma_4x4_neon_8;
+ +        for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) {
+ +          c->sao_band_filter[x]        = ff_hevc_sao_band_neon_wrapper;
+++          c->sao_band_filter_c[x]      = ff_hevc_sao_band_c_neon_wrapper;
+ +          c->sao_edge_filter[x]        = ff_hevc_sao_edge_neon_wrapper;
+++          c->sao_edge_filter_c[x]      = ff_hevc_sao_edge_c_neon_wrapper;
+ +        }
+++        c->sao_band_filter_c[2]        = ff_hevc_sao_band_c_neon_8;  // width=32
+          put_hevc_qpel_neon[1][0]       = ff_hevc_put_qpel_v1_neon_8;
+          put_hevc_qpel_neon[2][0]       = ff_hevc_put_qpel_v2_neon_8;
+          put_hevc_qpel_neon[3][0]       = ff_hevc_put_qpel_v3_neon_8;
+-@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+              c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_put_qpel_bi_neon_wrapper;
+              c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+              c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_put_qpel_bi_neon_wrapper;
+@@ -1707,7 +2452,7 @@ index 5591807..49c70dd 100644
+          c->put_hevc_qpel[0][0][0]  = ff_hevc_put_pixels_w2_neon_8;
+          c->put_hevc_qpel[1][0][0]  = ff_hevc_put_pixels_w4_neon_8;
+          c->put_hevc_qpel[2][0][0]  = ff_hevc_put_pixels_w6_neon_8;
+-@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
++@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
+          c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+          c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+      }
+@@ -1719,10 +2464,10 @@ index 5591807..49c70dd 100644
+  }
+ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S
+ new file mode 100644
+-index 0000000..9c7808d
++index 0000000..08a021d
+ --- /dev/null
+ +++ b/libavcodec/arm/hevcdsp_sao_neon.S
+-@@ -0,0 +1,510 @@
++@@ -0,0 +1,862 @@
+ +/*
+ + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
+ + *
+@@ -1848,24 +2593,186 @@ index 0000000..9c7808d
+ +
+ +function ff_hevc_sao_band_w64_neon_8, export=1
+ +        init_sao_band
+++
+++        push      {r4, lr}
+++        subs      r12, #1
+++        mov       r4, r1
+++        it ne
+++        addne     r4, r3
+++
+ +1:      subs      r12, #1
+-+        pld       [r1, r3]
+-+        vld1.8    {q8-q9}, [r1, :128]!
+-+        vshr.u8  q12, q8, #3
+-+        vshr.u8  q13, q9, #3
+-+        vld1.8    {q10-q11}, [r1, :128], r3
+-+        vshr.u8  q14, q10, #3
+-+        vshr.u8  q15, q11, #3
+-+        sub       r1, #32
+++        vldm      r1, {q8-q11}
+++        pld       [r4]
+++        vshr.u8   q12, q8, #3
+++        vshr.u8   q13, q9, #3
+++        add       r1, r3
+++        vshr.u8   q14, q10, #3
+++        vshr.u8   q15, q11, #3
+ +        sao_band_64
+-+        vst1.8    {q8-q9}, [r0, :128]!
+-+        vst1.8    {q10-q11}, [r0, :128], r2
+++        it ne
+++        addne     r4, r3
+++        vstm      r0, {q8-q11}
+++        add       r0, r2
+++        bpl       1b
+++
+++        pop       {r4, pc}
+++endfunc
+++
+++
+++@ ff_hevc_sao_band_c_neon_8(
+++@   uint8_t * dst          [r0]
+++@   uint8_t * src          [r1]
+++@   uint32_t dst_stride    [r2]
+++@   uint32_t src_stride    [r3]
+++@   const int16_t * table1 sp[0]
+++@   uint32_t offset1       sp[4]
+++@   const int16_t * table2 sp[8]
+++@   uint32_t offset2       sp[12]
+++@   int width              sp[16]
+++@   int height             sp[20]
+++
+++@ As this is often done in-place on the frame buffer it is worth preloading
+++@ the pixel values but we want to beware of loading outside our buffer to avoid
+++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU)
+++
+++function ff_hevc_sao_band_c_neon_8, export=1
+++        mov     r12, sp
+++        push   {r4-r8, lr}  // 24 bytes
+++
+++        ldm     r12, {r4-r7}
+++
+++        add     r4, #2
+++        add     r6, #2
+++        vld1.16 {d16}, [r4]    @ Unaligned
+++        lsl     r5, r5, #3
+++        vld1.16 {d18}, [r6]
+++        pld     [r1]
+++        vmov.i8  d17, #0
+++        mov     r4, r1
+++        vmov.i8  d19, #0
+++        lsl     r7, r7, #3
+++        vdup.8  q1, r5
+++        ldr     r5, [r12, #16]  @ width
+++        vdup.8  q2, r7
+++        ldr     r12, [r12, #20]
+++        vqmovn.s16 d0, q8
+++        cmp     r5, #16         @ At some point we may want a table lookup
+++        vqmovn.s16 d1, q9
+++        vmov.i8 q3, #128
+++        beq     16f
+++
+++        @ d0 U lookup
+++        @ d1 V lookup
+++        @ q1 U raw offset
+++        @ q2 V raw offset
+++        @ q3 #128
+++
+++        @ r4 = r1 = src - Intended for preload pointer
+++        @ r12 = height
+++
+++        @ Might (unlikely) be called with height == 1
+++        subs      r12, #1
+++        it ne
+++        addne     r4, r3
+++
+++1:
+++        subs      r12, #1
+++        vld2.8    {q8-q9}, [r1, :128]!
+++        vsub.u8   q12, q8, q1
+++        vld2.8    {q10-q11}, [r1, :128], r3
+++        vsub.u8   q14, q10, q1
+++        vsub.u8   q13, q9, q2
+++        sub       r1, #32
+++        vsub.u8   q15, q11, q2
+++        pld       [r4]
+++        vshr.u8   q12, #3
+++        vadd.s8   q8, q3
+++        vshr.u8   q13, #3
+++        vadd.s8   q9, q3
+++
+++        vtbl.8   d24, {d0}, d24
+++        vshr.u8  q14, #3
+++        vtbl.8   d25, {d0}, d25
+++        vshr.u8  q15, #3
+++        vtbl.8   d26, {d1}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d1}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0}, d28
+++        vqadd.s8 q8, q12
+++        vtbl.8   d29, {d0}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d1}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d1}, d31
+++        vsub.s8  q8, q3
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++
+++        it ne
+++        addne     r4, r3        @ Do not inc on final pass
+++        vst2.8    {q8-q9}, [r0, :128]!
+++        vst2.8    {q10-q11}, [r0, :128], r2
+ +        sub       r0, #32
+-+        bne       1b
+++        bpl       1b
+++
+++        pop    {r4-r8, pc}
+++
+++@ -- width 16 (UV pairs) --
+++16:
+++        subs    r12, #2
+++        it ne
+++        addne   r4, r4, r3, lsl #1
+++
+++1:
+++        subs      r12, #2
+++        vld2.8    {q8-q9}, [r1, :128], r3
+++        vsub.u8   q12, q8, q1
+++        vld2.8    {q10-q11}, [r1, :128], r3
+++        vsub.u8   q14, q10, q1
+++        vsub.u8   q13, q9, q2
+++        pld       [r4]
+++        vsub.u8   q15, q11, q2
+++        pld       [r4, r3]
+++        vshr.u8  q12, #3
+++        vadd.s8  q8, q3
+++        vshr.u8  q13, #3
+++        vadd.s8  q9, q3
+++
+++        vtbl.8   d24, {d0}, d24
+++        vshr.u8  q14, #3
+++        vtbl.8   d25, {d0}, d25
+++        vshr.u8  q15, #3
+++        vtbl.8   d26, {d1}, d26
+++        vadd.s8  q10, q3
+++        vtbl.8   d27, {d1}, d27
+++        vadd.s8  q11, q3
+++        vtbl.8   d28, {d0}, d28
+++        vqadd.s8 q8, q12
+++        vtbl.8   d29, {d0}, d29
+++        vqadd.s8 q9, q13
+++        vtbl.8   d30, {d1}, d30
+++        vqadd.s8 q10, q14
+++        vtbl.8   d31, {d1}, d31
+++        vsub.s8  q8, q3
+++        vqadd.s8 q11, q15
+++        vsub.s8  q9, q3
+++        vsub.s8  q10, q3
+++        vsub.s8  q11, q3
+++
+++        it ne
+++        addne   r4, r4, r3, lsl #1
+++        vst2.8    {q8-q9}, [r0, :128], r2
+++        vst2.8    {q10-q11}, [r0, :128], r2
+++        bpl       1b
+++
+++        pop    {r4-r8, pc}
+ +
+-+        bx lr
+ +endfunc
+ +
+++
+ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3
+ +        vcgt.u8 \out0, \in2, \in0  // c > a -> -1 , otherwise 0
+ +        vcgt.u8 \tmp0,  \in0, \in2  // a > c -> -1 , otherwise 0
+@@ -1875,71 +2782,120 @@ index 0000000..9c7808d
+ +        vsub.s8 \out1, \tmp1, \out1 // diff0 part 2
+ +.endm
+ +
+-+.macro table64
+-+        vmov.s8 q13, #2 // 2 to all elements
+-+        vmov.32  d24[0], r4  // load offset table from general registers
+-+        vmov.32  d24[1], r5  // load rest of offset table
+-+
+-+        vadd.s8 q0, q13
+-+        vadd.s8 q1, q13
+-+        vadd.s8 q2, q13
+-+        vadd.s8 q3, q13
+-+
+-+        vmov.u8  q15, #128 // s8 #-128
+-+        vtbl.8   d0, {d24}, d0
+-+        vadd.s8  q13,  q4, q15
+-+        vtbl.8   d1, {d24}, d1
+-+        vadd.s8  q14,  q5, q15
+-+        vtbl.8   d2, {d24}, d2
+-+        vqadd.s8 q0, q13
+-+        vtbl.8   d3, {d24}, d3
+-+        vqadd.s8 q1, q14
+-+        vtbl.8   d4, {d24}, d4
+-+        vadd.s8  q13,  q6, q15
+-+        vtbl.8   d5, {d24}, d5
+-+        vadd.s8  q14,  q7, q15
+-+        vtbl.8   d6, {d24}, d6
+-+        vqadd.s8 q2, q13
+-+        vtbl.8   d7, {d24}, d7
+-+        vqadd.s8 q3, q14
+-+        vsub.s8   q0, q15
+-+        vsub.s8   q1, q15
+-+        vsub.s8   q2, q15
+-+        vsub.s8   q3, q15
+-+        vst1.8  {q0-q1}, [r0, :128]!
+-+        vst1.8  {q2-q3}, [r0, :128], r2
+-+        sub     r0, #32
+-+.endm
+ +
+ +// input
+ +// a in q0 - q3
+ +// c in q4 - q7
+ +// b in q8 - q11
+-+// offset table in r7 and r5
+++// offset table r4,r5 and r6,r7
+++//   r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C
+ +// output in q0 - q3
+ +// clobbers q12 - q15
+-+.macro edge_w64_body
+-+        diff32 q12, q13, q0, q1, q0, q1, q4, q5
+-+        diff32 q0, q1, q14, q15, q8, q9, q4, q5
+ +
+-+        vadd.s8  q0, q12 //diff0 + diff1
+-+        vadd.s8  q1, q13
+++@ a <- c <- b
+++@
+++@ It appears that Neon can stall if you try and use results too soon so we try to
+++@ spread our instructions out
+++
+++.macro edgeidx64
+++
+++        vcgt.u8 q12, q4, q0  // c > a -> -1 , otherwise 0
+++        vcgt.u8 q13, q5, q1
+++        vcgt.u8 q14, q6, q2
+++        vcgt.u8 q15, q7, q3
+++
+++        vcgt.u8 q0, q0, q4  // a > c -> -1 , otherwise 0
+++        vcgt.u8 q1, q1, q5
+++        vcgt.u8 q2, q2, q6
+++        vcgt.u8 q3, q3, q7
+++
+++        vsub.s8 q0, q0, q12 // a = sign(c-a)
+++        vsub.s8 q1, q1, q13
+++        vsub.s8 q2, q2, q14
+++        vsub.s8 q3, q3, q15
+++
+++        vcgt.u8 q12, q4, q8  // c > b -> -1 , otherwise 0
+++        vcgt.u8 q13, q5, q9
+++        vcgt.u8 q14, q6, q10
+++        vcgt.u8 q15, q7, q11
+++
+++        vsub.s8 q0, q0, q12
+++        vsub.s8 q1, q1, q13
+++        vsub.s8 q2, q2, q14
+++        vsub.s8 q3, q3, q15
+++
+++        vcgt.u8 q12, q8, q4  // c < b -> -1 , otherwise 0
+++        vcgt.u8 q13, q9, q5
+++        vcgt.u8 q14, q10, q6
+++        vcgt.u8 q15, q11, q7
+++
+++        vadd.s8 q0, q0, q12  // a = sign(c-a) + sign(c-b)
+++        vadd.s8 q1, q1, q13
+++        vmov.u8 q12, #2
+++        vadd.s8 q2, q2, q14
+++        vadd.s8 q3, q3, q15
+++
+++        vadd.s8 q0, q0, q12
+++        vadd.s8 q1, q1, q12
+++        @ whilst vmov dn, rm, rn exists it is a vfp instruction
+++        @ and causes a stall till neon pipe empty - so don't do that!
+++        vmov    d26[0], r4
+++        vmov    d26[1], r5
+++        vmov    d27[0], r6
+++        vmov    d27[1], r7
+++        vadd.s8 q2, q2, q12
+++        vuzp.8    q0, q1
+++        vmov.u8 q15, #128
+++        vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b)
+++
+++        vtbl.8  d0, {d26}, d0
+++        vadd.s8 q12, q4, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d1, {d26}, d1
+++        vadd.s8 q14, q5, q15
+++
+++        vtbl.8  d2, {d27}, d2
+++        vuzp.8    q2, q3
+++
+++        vtbl.8  d3, {d27}, d3
+++
+++        vtbl.8  d4, {d26}, d4
+++        vzip.8    q0, q1
+++
+++        vtbl.8  d5, {d26}, d5
+++        vqadd.s8 q0, q0, q12
+++        vqadd.s8 q1, q1, q14
+++        vadd.s8 q12, q6, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d6, {d27}, d6
+++        vadd.s8 q14, q7, q15  // Add -128 so we can use saturating signed add
+++
+++        vtbl.8  d7, {d27}, d7
+++        vzip.8   q2, q3
+++
+++        vsub.s8 q0, q0, q15
+++        vqadd.s8 q2, q2, q12
+++        vqadd.s8 q3, q3, q14
+++        vsub.s8 q1, q1, q15
+++        vsub.s8 q2, q2, q15
+++        vsub.s8 q3, q3, q15
+ +
+-+        diff32  q14, q15, q2, q3, q2, q3, q6, q7
+-+        diff32  q2, q3, q12, q13, q10, q11, q6, q7
+-+
+-+        vadd.s8  q2, q14
+-+        vadd.s8  q3, q15
+-+        table64
+ +.endm
+ +
+++function edge_w64_body
+++        edgeidx64
+++        vstm    r0, {q0-q3}
+++        add     r0, r0, r2
+++        bx       lr
+++endfunc
+++
+ +.macro init_edge_64
+-+        push   {r4-r5}
+-+        ldr    r12, [sp, #8] // height
+-+        ldr    r5, [sp, #12] // sao_offset_val_table
+-+        ldr    r4, [r5]
+-+        add    r5, #4
+-+        ldr    r5, [r5]
+++        push   {r4-r8,lr}
+++        ldr    r12, [sp, #24] // height
+++        ldr    r5,  [sp, #28] // sao_offset_val_table
+++        ldrd   r4, r5, [r5]
+++        mov    r6, r4
+++        mov    r7, r5
+ +.endm
+ +
+ +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1
+@@ -1962,11 +2918,10 @@ index 0000000..9c7808d
+ +        vext.8 q9, q5, q6, #1
+ +        vext.8 q10, q6, q7, #1
+ +        vext.8 q11, q7, q12, #1
+-+        edge_w64_body
+++        bl    edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1
+@@ -1986,7 +2941,7 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1, :128]!
+ +        vld1.8  {q10-q11}, [r1, :128], r3
+ +        sub     r1, #32
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        // copy c to a
+ +        vmov.64 q0, q4
+ +        vmov.64 q1, q5
+@@ -1999,8 +2954,7 @@ index 0000000..9c7808d
+ +        vmov.64 q7, q11
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1
+@@ -2024,11 +2978,10 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1]!
+ +        vld1.8  {q10-q11}, [r1]
+ +        sub     r1, #33
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+ +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1
+@@ -2052,13 +3005,157 @@ index 0000000..9c7808d
+ +        vld1.8  {q8-q9}, [r1]!
+ +        vld1.8  {q10-q11}, [r1]
+ +        sub     r1, #31
+-+        edge_w64_body
+++        bl      edge_w64_body
+ +        bne   1b
+ +        vpop  {d8-d15}
+-+        pop   {r4-r5}
+-+        bx lr
+++        pop   {r4-r8,pc}
+++endfunc
+++
+++
+++@ void ff_hevc_sao_edge_c_w64_neon_8(
+++@   uint8_t *_dst,               r0
+++@   uint8_t *_src,               r1
+++@   ptrdiff_t stride_dst,        r2
+++@   ptrdiff_t stride_src,        r3
+++@   int height,                  sp[0]
+++@   int16_t *sao_offset_table_u, sp[4]
+++@   int16_t *sao_offset_table_v, sp[8]
+++@   int eo);                     sp[12]
+++
+++function ff_hevc_sao_edge_c_w64_neon_8, export=1
+++        push   {r4-r8,lr}     // 6 reg = 24
+++        ldr    r5,  [sp, #28] // sao_offset_val_table_u
+++        ldr    r7,  [sp, #32] // sao_offset_val_table_v
+++
+++        @ Load and rearrange offsets
+++        @ Also "convert" from 16bit to 8bit
+++        ldrb    r4, [r5, #2]
+++        ldrb    r8, [r5, #4]
+++        ldrb    r6, [r7, #2]
+++        ldrb    r12, [r7, #4]
+++        orr     r4, r4, r8, lsl #8
+++        orr     r6, r6, r12, lsl #8
+++        ldrb    r8, [r5, #6]
+++        ldrb    r12, [r7, #6]
+++        orr     r4, r4, r8, lsl #24
+++        orr     r6, r6, r12, lsl #24
+++        ldrb    r5, [r5, #8]
+++        ldrb    r7, [r7, #8]
+++
+++        ldr     r12, [sp, #36] // e0
+++        adr     r8, edge_c_tbl_w64
+++        ldr     r8, [r8, r12, lsl #2]
+++
+++        ldr     r12, [sp, #24] // height
+++        vpush   {d8-d15}
+++        mov     pc, r8
+++
+++edge_c_tbl_w64:
+++        .word   ff_hevc_sao_edge_c_eo0_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo1_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo2_w64_neon_8
+++        .word   ff_hevc_sao_edge_c_eo3_w64_neon_8
+++
+++ff_hevc_sao_edge_c_eo0_w64_neon_8:
+++        sub    r1, #8
+++1:      subs    r12, #1
+++        vld1.64  {d7}, [r1, :64]!
+++        vld1.64  {q4-q5}, [r1, :128]! // load c
+++        vld1.64  {q6-q7}, [r1, :128]!
+++        vld1.64  {d24}, [r1, :64], r3
+++        sub      r1, #72
+++        // load a
+++        vext.8 q0, q3, q4, #14
+++        vext.8 q1, q4, q5, #14
+++        vext.8 q2, q5, q6, #14
+++        vext.8 q3, q6, q7, #14
+++        // load b
+++        vext.8 q8, q4, q5, #2
+++        vext.8 q9, q5, q6, #2
+++        vext.8 q10, q6, q7, #2
+++        vext.8 q11, q7, q12, #2
+++        bl    edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo1_w64_neon_8:
+++        sub     r1, r3
+++        // load a
+++        vldm    r1, {q0-q3}
+++        add     r1, r3
+++        // load c
+++        vldm    r1, {q4-q7}
+++        add     r1, r3
+++1:      subs    r12, #1
+++        // load b
+++        vldm    r1, {q8-q11}
+++        add     r1, r3
+++        bl      edge_w64_body
+++        // copy c to a
+++        vmov.64 q0, q4
+++        vmov.64 q1, q5
+++        vmov.64 q2, q6
+++        vmov.64 q3, q7
+++        // copy b to c
+++        vmov.64 q4, q8
+++        vmov.64 q5, q9
+++        vmov.64 q6, q10
+++        vmov.64 q7, q11
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo2_w64_neon_8:
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        sub     r1, #2
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #30
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        add     r1, #2
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #34
+++        bl      edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+++
+++ff_hevc_sao_edge_c_eo3_w64_neon_8:
+++1:      sub     r1, r3
+++        // load a
+++        // TODO: fix unaligned load
+++        //       don't reload a like in eo1
+++        add     r1, #2
+++        vld1.8  {q0-q1}, [r1]!
+++        vld1.8  {q2-q3}, [r1], r3
+++        sub     r1, #34
+++        subs    r12, #1
+++        // load c
+++        vld1.8  {q4-q5}, [r1, :128]!
+++        vld1.8  {q6-q7}, [r1, :128], r3
+++        sub     r1, #32
+++        // load b
+++        sub     r1, #2
+++        vld1.8  {q8-q9}, [r1]!
+++        vld1.8  {q10-q11}, [r1]
+++        sub     r1, #30
+++        bl      edge_w64_body
+++        bne   1b
+++        vpop  {d8-d15}
+++        pop   {r4-r8,pc}
+ +endfunc
+ +
+++
+ +.macro init_edge_32
+ +        ldr     r12, [sp, #4] // sao_offset_val_table
+ +        vld1.32 {d31}, [r12]
+@@ -2175,7 +3272,7 @@ index 0000000..9c7808d
+ +        vext.8  q7, q11, q12, #8
+ +        vext.8  q5, q10, q11, #7
+ +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
+ +        vadd.s8 q0, q12 //diff0 + diff1
+ +        vadd.s8 q1, q13
+ +        table32
+@@ -2215,7 +3312,7 @@ index 0000000..9c7808d
+ +        vext.8  q14, q12, q10, #7
+ +
+ +        diff32 q12, q13, q0, q1, q0, q1, q2, q3
+-+        diff32 q0, q1, q10, q11, q8, q9, q2, q3
+++        diff32 q0, q1, q10, q11,  q8, q9, q2, q3
+ +
+ +        vadd.s8 q0, q12 //diff0 + diff1
+ +        vadd.s8 q1, q13
+@@ -2427,26 +3524,21 @@ index ce4bab2..b9b0c78 100644
+ +    .split          = h264_split,
+ +};
+ diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
+-index b478065..88dd40b 100644
++index b478065..955e426 100644
+ --- a/libavcodec/hevc.c
+ +++ b/libavcodec/hevc.c
+-@@ -41,8 +41,186 @@
++@@ -41,8 +41,196 @@
+  #include "hevc.h"
+  #include "profiles.h"
+  
+ +#ifdef RPI
+ +  #include "rpi_qpu.h"
+-+  #include "rpi_user_vcsm.h"
+-+  // Move Inter prediction into separate pass
+-+  #define RPI_INTER
+-+
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU
+-+    #define RPI_MULTI_MAILBOX
+-+  #endif
+++  #include "rpi_shader.h"
+++  #include "rpi_shader_cmd.h"
+++  #include "rpi_zc.h"
+ +
+ +  // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory
+-+  // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined.
+++  #define RPI_CACHE_UNIF_MVS  1
+ +
+ +  // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*)
+ +  //#define RPI_SIMULATE_QPUS
+@@ -2454,19 +3546,24 @@ index b478065..88dd40b 100644
+ +    #include "pthread.h"
+ +  #endif
+ +
+-+  static void rpi_execute_dblk_cmds(HEVCContext *s);
+-+  static void rpi_execute_transform(HEVCContext *s);
+-+  static void rpi_launch_vpu_qpu(HEVCContext *s);
+-+  static void rpi_execute_pred_cmds(HEVCContext *s);
+-+  static void rpi_execute_inter_cmds(HEVCContext *s);
+-+  static void rpi_begin(HEVCContext *s);
+-+  static void flush_frame(HEVCContext *s,AVFrame *frame);
+-+  static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job);
+++  static void worker_core(HEVCContext * const s);
+ +
+++  // We can pred any block height but annoyingly if we do then the TMU cache
+++  // explodes and it goes even slower :-(
+++  #if 0
+++  #define Y_P_MAX_H     16
+++  #define Y_B_MAX_H     16
+++  #else
+++  #define Y_P_MAX_H     64
+++  #define Y_B_MAX_H     64
+++  #endif
+ +#endif
+ +
+ +// #define DISABLE_MC
+ +
+++#define DISABLE_CHROMA 0
+++#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
+++
+ +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
+ +
+ +#ifndef av_mod_uintp2
+@@ -2477,45 +3574,65 @@ index b478065..88dd40b 100644
+ +#   define av_mod_uintp2   av_mod_uintp2_c
+ +#endif
+ +
+++#define Y_B_ONLY 0
+++
+  const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
+  
+ +
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+++
+++#define MC_DUMMY_X (-32)
+++#define MC_DUMMY_Y (-32)
+ +
+ +// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks
+ +// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks
+ +// For each block of 64*64 the smallest block size is 8x4
+ +// We also need an extra command for the setup information
+ +
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+-+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS)
+++#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4))
+ +// The QPU code for UV blocks only works up to a block width of 8
+ +#define RPI_CHROMA_BLOCK_WIDTH 8
+ +
+-+#define RPI_LUMA_COMMAND_WORDS 10
+-+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS)
+-+
+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
+ +
+ +// TODO Chroma only needs 4 taps
+ +
+ +// Actual filter goes -ve, +ve, +ve, -ve using these values
+-+static const uint32_t rpi_filter_coefs[8][1] = {
+-+        { ENCODE_COEFFS(   0,  64,   0,   0) },
+-+        { ENCODE_COEFFS(  2,  58,  10,  2) },
+-+        { ENCODE_COEFFS(  4,  54,  16,  2) },
+-+        { ENCODE_COEFFS(  6,  46,  28,  4) },
+-+        { ENCODE_COEFFS(  4,  36,  36,  4) },
+-+        { ENCODE_COEFFS(  4,  28,  46,  6) },
+-+        { ENCODE_COEFFS(  2,  16,  54,  4) },
+-+        { ENCODE_COEFFS(  2,  10,  58,  2) }
+++static const uint32_t rpi_filter_coefs[8] = {
+++        ENCODE_COEFFS(  0,  64,   0,  0),
+++        ENCODE_COEFFS(  2,  58,  10,  2),
+++        ENCODE_COEFFS(  4,  54,  16,  2),
+++        ENCODE_COEFFS(  6,  46,  28,  4),
+++        ENCODE_COEFFS(  4,  36,  36,  4),
+++        ENCODE_COEFFS(  4,  28,  46,  6),
+++        ENCODE_COEFFS(  2,  16,  54,  4),
+++        ENCODE_COEFFS(  2,  10,  58,  2)
+ +};
+ +
+++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)))
+++
+ +#endif
+ +
+ +
+ +#ifdef RPI_WORKER
+ +
+++typedef struct worker_global_env_s
+++{
+++    volatile int arm_load;
+++    pthread_mutex_t lock;
+++
+++    unsigned int arm_y;
+++    unsigned int arm_c;
+++    unsigned int gpu_y;
+++    unsigned int gpu_c;
+++} worker_global_env_t;
+++
+++static worker_global_env_t worker_global_env =
+++{
+++    .lock = PTHREAD_MUTEX_INITIALIZER
+++};
+++
+++
+ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+ +
+@@ -2594,17 +3711,7 @@ index b478065..88dd40b 100644
+ +      break;
+ +    }
+ +    LOG_ENTER
+-+    // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10);
+-+    rpi_launch_vpu_qpu(s);
+-+    // Perform inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+++    worker_core(s);
+ +
+ +    worker_complete_job(s);
+ +    LOG_EXIT
+@@ -2617,7 +3724,7 @@ index b478065..88dd40b 100644
+  /**
+   * NOTE: Each function hls_foo correspond to the function foo in the
+   * specification (HLS stands for High Level Syntax).
+-@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
++@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
+  /* free everything allocated  by pic_arrays_init() */
+  static void pic_arrays_free(HEVCContext *s)
+  {
+@@ -2650,36 +3757,40 @@ index b478065..88dd40b 100644
+      av_freep(&s->sao);
+      av_freep(&s->deblock);
+  
+-@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
++@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
+      int ctb_count        = sps->ctb_width * sps->ctb_height;
+      int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
+  
+ +#ifdef RPI
+-+    int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+-+    int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS;
+-+    int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+-+    int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+++    const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+++    const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
+++    const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+++    const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ +    int job;
+ +
+ +    av_assert0(sps);
+-+    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+-+    s->ctu_per_y_chan = s->max_ctu_count / 12;
+-+    s->ctu_per_uv_chan = s->max_ctu_count / 8;
+++//    s->max_ctu_count = sps->ctb_width;
+++//    printf("CTB with=%d\n", sps->ctb_width);
+++//    s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+++    s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width);
+++    s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y;
+++    s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV;
+++
+ +    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+      printf("Allocated %d\n",coefs_per_row);
+-+      for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+-+        s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+-+        if (!s->coeffs_buf_arm[job][0])
+-+            goto fail;
+-+        gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+-+        s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+-+        s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+-+        if (!s->coeffs_buf_arm[job][2])
+-+            goto fail;
+-+        s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+-+        s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+-+      }
+++        for(job=0;job<RPI_MAX_JOBS;job++) {
+++            gpu_malloc_cached(sizeof(int16_t) * coefs_per_row, &s->coeffs_buf_default[job]);
+++            s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm;
+++            if (!s->coeffs_buf_arm[job][0])
+++                goto fail;
+++
+++            gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]);  // We prefetch past the end so provide an extra blocks worth of data
+++            s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm;
+++            s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc;
+++            if (!s->coeffs_buf_arm[job][2])
+++                goto fail;
+++            s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2];  // This points to just beyond the end of the buffer.  Coefficients fill in backwards.
+++            s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2];
+++        }
+ +    }
+ +#endif
+ +#ifdef RPI_DEBLOCK_VPU
+@@ -2726,8 +3837,6 @@ index b478065..88dd40b 100644
+ +
+ +            dvq->uv_setup_arm = (void*)p_arm;
+ +            dvq->uv_setup_vc = (void*)p_vc;
+-+
+-+            dvq->cmd_id = -1;
+ +        }
+ +
+ +        s->dvq_n = 0;
+@@ -2738,7 +3847,7 @@ index b478065..88dd40b 100644
+      s->bs_width  = (width  >> 2) + 1;
+      s->bs_height = (height >> 2) + 1;
+  
+-@@ -137,6 +422,29 @@ fail:
++@@ -137,6 +434,29 @@ fail:
+      return AVERROR(ENOMEM);
+  }
+  
+@@ -2768,7 +3877,52 @@ index b478065..88dd40b 100644
+  static void pred_weight_table(HEVCContext *s, GetBitContext *gb)
+  {
+      int i = 0;
+-@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s)
++@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps,
++ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt)
++ {
++     #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL)
++-    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
+++    enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts;
++     int ret, i;
++ 
++     pic_arrays_free(s);
++@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++     switch (sps->pix_fmt) {
++     case AV_PIX_FMT_YUV420P:
++     case AV_PIX_FMT_YUVJ420P:
+++#if RPI_HEVC_SAND
+++        // Currently geometry calc is stuffed for big sizes
+++        if (sps->width < 2048 && sps->height <= 1088) {
+++            *fmt++ = AV_PIX_FMT_SAND128;
+++        }
+++#endif
++ #if CONFIG_HEVC_DXVA2_HWACCEL
++         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
++ #endif
++@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++         ret = ff_thread_get_format(s->avctx, pix_fmts);
++         if (ret < 0)
++             goto fail;
+++
++         s->avctx->pix_fmt = ret;
++     }
++     else {
++@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++         for(c_idx = 0; c_idx < c_count; c_idx++) {
++             int w = sps->width >> sps->hshift[c_idx];
++             int h = sps->height >> sps->vshift[c_idx];
+++            // ******** Very very nasty allocation kludge for plaited Chroma
++             s->sao_pixel_buffer_h[c_idx] =
++-                av_malloc((w * 2 * sps->ctb_height) <<
+++                av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) <<
++                           sps->pixel_shift);
++             s->sao_pixel_buffer_v[c_idx] =
++-                av_malloc((h * 2 * sps->ctb_width) <<
+++                av_malloc((h * 2 * sps->ctb_width  * (1 + (c_idx == 1))) <<
++                           sps->pixel_shift);
++         }
++     }
++@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s)
+                  (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) {
+                  pred_weight_table(s, gb);
+              }
+@@ -2780,33 +3934,42 @@ index b478065..88dd40b 100644
+  
+              sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
+              if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
+-@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
++@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) {
+      return 0;
+  }
+  
+ +#ifdef RPI
+ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx)
+ +{
+++    // U & V done on U call in the case of sliced frames
+++    if (rpi_sliced_frame(s->frame) && c_idx > 1)
+++        return;
+++
+ +    if (s->enable_rpi) {
+ +        HEVCLocalContext *lc = s->HEVClc;
+ +        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+ +        cmd->type = RPI_PRED_INTRA;
+ +        cmd->size = log2_trafo_size;
+-+        cmd->c_idx = c_idx;
+-+        cmd->x = x0;
+-+        cmd->y = y0;
+ +        cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
+-+        cmd->mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+-+    } else {
+++        cmd->c_idx = c_idx;
+++        cmd->i_pred.x = x0;
+++        cmd->i_pred.y = y0;
+++        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;
+++    }
+++    else if (rpi_sliced_frame(s->frame) && c_idx != 0) {
+++        s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx);
+++    }
+++    else {
+ +        s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx);
+ +    }
+++
+ +}
+ +#endif
+ +
+  static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                                int xBase, int yBase, int cb_xBase, int cb_yBase,
+                                int log2_cb_size, int log2_trafo_size,
+-@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+      if (lc->cu.pred_mode == MODE_INTRA) {
+          int trafo_size = 1 << log2_trafo_size;
+          ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size);
+@@ -2819,7 +3982,7 @@ index b478065..88dd40b 100644
+      }
+  
+      if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
+-@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+@@ -2831,7 +3994,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cb[i])
+                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) {
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
+@@ -2843,7 +4006,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cr[i])
+                      ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c),
+-@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                      trafo_size_h, trafo_size_v);
+@@ -2855,7 +4018,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cb[i])
+                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+                  if (lc->cu.pred_mode == MODE_INTRA) {
+                      ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size),
+                                                  trafo_size_h, trafo_size_v);
+@@ -2867,7 +4030,7 @@ index b478065..88dd40b 100644
+                  }
+                  if (cbf_cr[i])
+                      ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size),
+-@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
++@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0,
+              int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]);
+              int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]);
+              ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v);
+@@ -2914,17 +4077,162 @@ index b478065..88dd40b 100644
+              }
+          }
+      }
+-@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
++@@ -1275,47 +1675,120 @@ do {
++     return 0;
++ }
++ 
++-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+++
+++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++ {
++-    HEVCLocalContext *lc = s->HEVClc;
++     GetBitContext gb;
++-    int cb_size   = 1 << log2_cb_size;
++-    int stride0   = s->frame->linesize[0];
++-    uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
++-    int   stride1 = s->frame->linesize[1];
++-    uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++-    int   stride2 = s->frame->linesize[2];
++-    uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
++-
++-    int length         = cb_size * cb_size * s->ps.sps->pcm.bit_depth +
++-                         (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) +
++-                          ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) *
++-                          s->ps.sps->pcm.bit_depth_chroma;
++-    const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
++     int ret;
++ 
++-    if (!s->sh.disable_deblocking_filter_flag)
++-        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
++-
++     ret = init_get_bits(&gb, pcm, length);
++     if (ret < 0)
++         return ret;
++ 
++-    s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size,     &gb, s->ps.sps->pcm.bit_depth);
++-    if (s->ps.sps->chroma_format_idc) {
++-        s->hevcdsp.put_pcm(dst1, stride1,
+++#ifdef RPI
+++    if (rpi_sliced_frame(s->frame)) {
+++        s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0),
+++                           s->frame->linesize[0],
+++                           cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+++
+++        s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]),
+++                           s->frame->linesize[1],
++                            cb_size >> s->ps.sps->hshift[1],
++                            cb_size >> s->ps.sps->vshift[1],
++                            &gb, s->ps.sps->pcm.bit_depth_chroma);
++-        s->hevcdsp.put_pcm(dst2, stride2,
++-                           cb_size >> s->ps.sps->hshift[2],
++-                           cb_size >> s->ps.sps->vshift[2],
++-                           &gb, s->ps.sps->pcm.bit_depth_chroma);
++     }
+++    else
+++#endif
+++    {
+++        const int stride0   = s->frame->linesize[0];
+++        uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)];
+++        const int   stride1 = s->frame->linesize[1];
+++        uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+++        const int   stride2 = s->frame->linesize[2];
+++        uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)];
+++
+++        s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
+++        if (s->ps.sps->chroma_format_idc) {
+++            s->hevcdsp.put_pcm(dst1, stride1,
+++                               cb_size >> s->ps.sps->hshift[1],
+++                               cb_size >> s->ps.sps->vshift[1],
+++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
+++            s->hevcdsp.put_pcm(dst2, stride2,
+++                               cb_size >> s->ps.sps->hshift[2],
+++                               cb_size >> s->ps.sps->vshift[2],
+++                               &gb, s->ps.sps->pcm.bit_depth_chroma);
+++        }
++ 
+++    }
++     return 0;
++ }
++ 
+++#ifdef RPI
+++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)  // Reserve n int16_t slots in coeff buffer buf_no of the current pass0 job
+++{
+++    int16_t * const coeffs = (buf_no != 3) ?
+++        s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] :  // buffers other than 3 grow upwards from their base pointer
+++        s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n;  // buffer 3 is handed out downwards from its base pointer
+++    s->num_coeffs[s->pass0_job][buf_no] += n;  // record the slots as used; no bounds check is made here
+++    return coeffs;  // lowest address of the newly reserved slots
+++}
+++#endif
+++
+++// Returns x * 2^(y*2), i.e. x multiplied by the area of a square whose side is 2^y
+++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
+++{
+++    return x << (y * 2);  // left shift by 2*y bits
+++}
+++
+++// Handle the PCM samples of the coding block at (x0, y0), side 1 << log2_cb_size: step over the raw
+++// sample bytes, then either queue them as an RPI pred command or unpack them via pcm_extract().
+++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size)
+++{
+++    // Length in bits.  Each chroma plane holds (cb_size >> hshift) * (cb_size >> vshift) samples,
+++    // so both shifts must be applied; using vshift alone over-counts when hshift != vshift (4:2:2)
+++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
+++        (s->ps.sps->pcm.bit_depth_chroma << (2 * log2_cb_size - s->ps.sps->hshift[1] - s->ps.sps->vshift[1])) +
+++        (s->ps.sps->pcm.bit_depth_chroma << (2 * log2_cb_size - s->ps.sps->hshift[2] - s->ps.sps->vshift[2]));
+++
+++    const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3);
+++
+++    if (!s->sh.disable_deblocking_filter_flag)
+++        ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size);
+++
+++#ifdef RPI
+++    if (s->enable_rpi) {
+++        // Copy coeffs
+++        const int blen = (length + 7) >> 3;
+++        // Round allocated bytes up to nearest 32 to avoid alignment confusion; allocation is in
+++        // int16_t s (hence the >> 1).  As we are only using 1 byte per sample and the coeff buffer allows
+++        // 2 per sample this rounding doesn't affect the total size we need to allocate for the coeff buffer
+++        int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1);
+++        memcpy(coeffs, pcm, blen);
+++
+++        // Our coeff stash assumes that any partially allocated 64byte lump is zeroed so make that true.
+++        {
+++            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
+++            if ((-(intptr_t)eopcm & 63) != 0)
+++                memset(eopcm, 0, -(intptr_t)eopcm & 63);
+++        }
+++
+++        // Add command
+++        {
+++            HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++            cmd->type = RPI_PRED_I_PCM;
+++            cmd->size = log2_cb_size;
+++            cmd->i_pcm.src = coeffs;
+++            cmd->i_pcm.x = x0;
+++            cmd->i_pcm.y = y0;
+++            cmd->i_pcm.src_len = length;
+++        }
+++        return 0;
+++    }
+++#endif
+++
+++    return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size);
+++}
+++
++ /**
++  * 8.5.3.2.2.1 Luma sample unidirectional interpolation process
++  *
++@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size)
+   * @param luma_offset additive offset applied to the luma prediction value
+   */
+  
+-+#ifdef RPI_INTER
+-+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn)
+++#if RPI_INTER
+ +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ +                        AVFrame *ref, const Mv *mv, int x_off, int y_off,
+ +                        int block_w, int block_h, int luma_weight, int luma_offset)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_LUMA_UNI;
+ +    cmd->dst = dst;
+ +    cmd->dststride = dststride;
+@@ -2941,9 +4249,10 @@ index b478065..88dd40b 100644
+ +
+ +static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+ +                       AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+-+                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+++                       int block_w, int block_h, AVFrame *ref1, const Mv *mv1,
+++                       const struct MvField * const current_mv)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_LUMA_BI;
+ +    cmd->dst = dst;
+ +    cmd->dststride = dststride;
+@@ -2961,17 +4270,17 @@ index b478065..88dd40b 100644
+ +    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+-+                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist,
+-+                          int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset)
+++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+++                          ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride,
+++                          int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_CHROMA_UNI;
+ +    cmd->dst = dst0;
+ +    cmd->dststride = dststride;
+ +    cmd->src = src0;
+ +    cmd->srcstride = srcstride;
+-+    cmd->mv = current_mv->mv[reflist];
+++    cmd->mv = *mv;
+ +    cmd->x_off = x_off;
+ +    cmd->y_off = y_off;
+ +    cmd->block_w = block_w;
+@@ -2980,10 +4289,10 @@ index b478065..88dd40b 100644
+ +    cmd->offset = chroma_offset;
+ +}
+ +
+-+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+-+                         int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx)
+++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1,
+++                         int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx)
+ +{
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++;
+++    HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++;
+ +    cmd->cmd = RPI_CMD_CHROMA_BI+cidx;
+ +    cmd->dst = dst0;
+ +    cmd->dststride = dststride;
+@@ -3001,14 +4310,12 @@ index b478065..88dd40b 100644
+ +    cmd->ref_idx[1] = current_mv->ref_idx[1];
+ +}
+ +
+-+#else
+-+#define RPI_REDIRECT(fn) fn
+ +#endif
+ +
+  static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                          AVFrame *ref, const Mv *mv, int x_off, int y_off,
+                          int block_w, int block_h, int luma_weight, int luma_offset)
+-@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+                             (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag);
+      int idx              = ff_hevc_pel_weight[block_w];
+  
+@@ -3019,7 +4326,7 @@ index b478065..88dd40b 100644
+      x_off += mv->x >> 2;
+      y_off += mv->y >> 2;
+      src   += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+   * @param mv1 motion vector1 (relative to block position) to get pixel data from
+   * @param current_mv current motion vector structure
+   */
+@@ -3028,7 +4335,7 @@ index b478065..88dd40b 100644
+                         AVFrame *ref0, const Mv *mv0, int x_off, int y_off,
+                         int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv)
+  {
+-@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
++@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride,
+      uint8_t *src0  = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift);
+      uint8_t *src1  = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift);
+  
+@@ -3039,7 +4346,7 @@ index b478065..88dd40b 100644
+      if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER ||
+          x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER ||
+          y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) {
+-@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
++@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0,
+      intptr_t _mx         = mx << (1 - hshift);
+      intptr_t _my         = my << (1 - vshift);
+  
+@@ -3050,7 +4357,7 @@ index b478065..88dd40b 100644
+      x_off += mv->x >> (2 + hshift);
+      y_off += mv->y >> (2 + vshift);
+      src0  += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift));
+-@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
++@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF
+      int hshift = s->ps.sps->hshift[1];
+      int vshift = s->ps.sps->vshift[1];
+  
+@@ -3061,13 +4368,422 @@ index b478065..88dd40b 100644
+      intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift);
+      intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift);
+      intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift);
+-@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
++@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW,
+      }
+  }
+  
+ -static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+ -                                int nPbW, int nPbH,
+ -                                int log2_cb_size, int partIdx, int idx)
+++
+++#if RPI_INTER
+++
+++static HEVCRpiLumaPred *  // returns the least-loaded luma pred entry, with its load already incremented
+++rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val)  // load_val is currently unused (see the disabled line below)
+++{
+++    HEVCRpiLumaPred * yp = s->curr_pred_y;  // best candidate so far, starting with the first entry
+++    HEVCRpiLumaPred * ypt = yp + 1;
+++    for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) {  // scan the remaining QPU_N_GRP_Y - 1 entries
+++        if (ypt->load < yp->load)  // strict compare: on a tie the earlier entry is kept
+++            yp = ypt;
+++    }
+++
+++//        yp->load += load_val;
+++    ++yp->load;  // every request counts as one unit of load
+++    return yp;
+++}
+++
+++static void  // Fill in QPU luma prediction commands (single reference) for the nPbW x nPbH block at (x0, y0)
+++rpi_pred_y(HEVCContext *const s, const int x0, const int y0,
+++           const int nPbW, const int nPbH,
+++           const Mv *const mv,
+++           const int weight_mul,
+++           const int weight_offset,
+++           AVFrame *const src_frame)
+++{
+++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);  // added to the frame's VC luma address below
+++
+++//    rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame,
+++//                    mv, x0, y0, nPbW, nPbH,
+++//                    weight_mul, weight_offset);
+++
+++    {
+++        const unsigned int mx          = mv->x & 3;  // low 2 bits of the MV; the rest (mv >> 2) is used as the integer offset below
+++        const unsigned int my          = mv->y & 3;
+++        const unsigned int my_mx       = (my << 8) | mx;  // my from bit 8 up, mx in the low bits
+++        const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;  // the same pair duplicated in the upper and lower 16 bits
+++        const int x1_m3 = x0 + (mv->x >> 2) - 3;  // NOTE(review): the -3 is presumably the interpolation filter's leading margin - confirm
+++        const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++        const uint32_t src_vc_address_y = get_vc_address_y(src_frame);
+++        uint32_t dst_addr = get_vc_address_y(s->frame) + y_off;
+++        const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul);
+++
+++        // Potentially we could change the assembly code to support taller sizes in one go
+++        for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16)  // NOTE(review): dst_addr steps 16 rows; only matches start_y if Y_P_MAX_H == 16 - confirm
+++        {
+++            const uint32_t src_yx_y = y1_m3 + start_y;
+++            int start_x = 0;
+++            const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H);  // height of this strip, at most Y_P_MAX_H
+++
+++#if 1
+++            // As Y-pred operates on two independent 8-wide src blocks we can merge
+++            // this pred with the previous one if the previous one is 8 pel wide,
+++            // the same height as the current block, immediately to the left of our
+++            // current dest block and mono-pred.
+++
+++            qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p;
+++            if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr)
+++            {
+++                const int bw = FFMIN(nPbW, 8);
+++                qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx;
+++
+++                last_y8_lx->next_src2_x = x1_m3;
+++                last_y8_lx->next_src2_y = src_yx_y;
+++                last_y8_lx->next_src2_base = src_vc_address_y;
+++                last_y8_p->p.w += bw;
+++                last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21);
+++                last_y8_p->p.wo2 = wo;
+++
+++                s->last_y8_p = NULL;  // a command is merged into at most once
+++                s->last_y8_lx = NULL;
+++                start_x = bw;  // the first bw columns are now covered by the merged command
+++#if RPI_TSTATS
+++                ++s->tstats.y_pred1_y8_merge;
+++#endif
+++            }
+++#endif
+++
+++            for (; start_x < nPbW; start_x += 16)  // remaining width in chunks of up to 16
+++            {
+++                const int bw = FFMIN(nPbW - start_x, 16);
+++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);  // entry with the lowest load so far
+++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;  // previous command of this entry: its next_src* fields are set for the new command
+++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;
+++#if RPI_TSTATS
+++                {
+++                    HEVCRpiStats *const ts = &s->tstats;
+++                    if (mx == 0 && my == 0)
+++                        ++ts->y_pred1_x0y0;
+++                    else if (mx == 0)
+++                        ++ts->y_pred1_x0;
+++                    else if (my == 0)
+++                        ++ts->y_pred1_y0;
+++                    else
+++                        ++ts->y_pred1_xy;
+++
+++                    if (nPbW > 8)
+++                        ++ts->y_pred1_wgt8;
+++                    else
+++                        ++ts->y_pred1_wle8;
+++
+++                    if (nPbH > 16)
+++                        ++ts->y_pred1_hgt16;
+++                    else
+++                        ++ts->y_pred1_hle16;
+++                }
+++#endif
+++                cmd_y[-1].next_fn = s->qpu_filter;
+++                cmd_lx->next_src1_x = x1_m3 + start_x;
+++                cmd_lx->next_src1_y = src_yx_y;
+++                cmd_lx->next_src1_base = src_vc_address_y;
+++                if (bw <= 8)  // only one 8-wide half in use: the second source points at the dummy frame
+++                {
+++                    cmd_lx->next_src2_x = MC_DUMMY_X;
+++                    cmd_lx->next_src2_y = MC_DUMMY_Y;
+++                    cmd_lx->next_src2_base = s->qpu_dummy_frame;
+++                }
+++                else
+++                {
+++                    cmd_lx->next_src2_x = x1_m3 + start_x + 8;
+++                    cmd_lx->next_src2_y = src_yx_y;
+++                    cmd_lx->next_src2_base = src_vc_address_y;
+++                }
+++                cmd_y->p.w = bw;
+++                cmd_y->p.h = bh;
+++                cmd_y->p.mymx21 = my2_mx2_my_mx;
+++                cmd_y->p.wo1 = wo;
+++                cmd_y->p.wo2 = wo;
+++                cmd_y->p.dst_addr =  dst_addr + start_x;
+++                yp->last_lx = cmd_y;
+++                yp->qpu_mc_curr = cmd_y + 1;
+++
+++                if (bw == 8) {  // remember 8-wide commands so a following call can merge into them
+++                    s->last_y8_lx = cmd_lx;
+++                    s->last_y8_p = cmd_y;
+++                }
+++            }
+++        }
+++    }
+++}
+++
+++static void
+++rpi_pred_y_b(HEVCContext * const s,
+++           const int x0, const int y0,
+++           const int nPbW, const int nPbH,
+++           const struct MvField *const mv_field,
+++           AVFrame *const src_frame,
+++           AVFrame *const src_frame2)
+++{
+++    const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0);
+++    const Mv * const mv  = mv_field->mv + 0;
+++    const Mv * const mv2 = mv_field->mv + 1;
+++
+++    // Queue bi-predicted luma commands for the nPbW x nPbH block at (x0, y0):
+++    // mv_field->mv[0] reads from src_frame, mv_field->mv[1] from src_frame2.
+++    // Stands in for rpi_luma_mc_bi(s, dst, stride, src_frame, mv, x0, y0, nPbW, nPbH, src_frame2, mv2, mv_field).
+++    {
+++        const unsigned int mx          = mv->x & 3;   // low two bits of each MV component
+++        const unsigned int my          = mv->y & 3;
+++        const unsigned int my_mx = (my<<8) | mx;
+++        const unsigned int mx2          = mv2->x & 3;
+++        const unsigned int my2          = mv2->y & 3;
+++        const unsigned int my2_mx2 = (my2<<8) | mx2;
+++        const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;   // one byte each: my2, mx2, my, mx
+++        const int x1 = x0 + (mv->x >> 2) - 3;   // remaining MV bits, less 3 (presumably the filter margin - confirm)
+++        const int y1 = y0 + (mv->y >> 2) - 3;
+++        const int x2 = x0 + (mv2->x >> 2) - 3;
+++        const int y2 = y0 + (mv2->y >> 2) - 3;
+++        const unsigned int ref_idx0 = mv_field->ref_idx[0];
+++        const unsigned int ref_idx1 = mv_field->ref_idx[1];
+++        const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] +   // combined L0 + L1 offset, plus 1
+++                     s->sh.luma_offset_l1[ref_idx1] + 1;
+++        const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
+++        const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
+++
+++        uint32_t dst = get_vc_address_y(s->frame) + y_off;   // destination of the current row of tiles
+++        const uint32_t src1_base = get_vc_address_y(src_frame);
+++        const uint32_t src2_base = get_vc_address_y(src_frame2);
+++
+++        for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H)
+++        {
+++            const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H);   // last row of tiles may be shorter
+++
+++            for (int start_x=0; start_x < nPbW; start_x += 8)
+++            { // B blocks work 8 at a time
+++                HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7);   // queue to append to (chosen by rpi_nxt_pred_y)
+++                qpu_mc_pred_y_t *const cmd_lx = yp->last_lx;       // previous entry: receives this tile's source setup
+++                qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr;    // entry being filled in for this tile
+++#if RPI_TSTATS
+++              {
+++                  HEVCRpiStats *const ts = &s->tstats;
+++                  const unsigned int mmx = mx | mx2;
+++                  const unsigned int mmy = my | my2;
+++                  if (mmx == 0 && mmy == 0)
+++                      ++ts->y_pred2_x0y0;
+++                  else if (mmx == 0)
+++                      ++ts->y_pred2_x0;
+++                  else if (mmy == 0)
+++                      ++ts->y_pred2_y0;
+++                  else
+++                      ++ts->y_pred2_xy;
+++
+++                  if (nPbH > 16)
+++                      ++ts->y_pred2_hgt16;
+++                  else
+++                      ++ts->y_pred2_hle16;
+++              }
+++#endif
+++              cmd_y[-1].next_fn = s->qpu_filter_b;   // the preceding entry names the function for this tile
+++              cmd_lx->next_src1_x = x1 + start_x;
+++              cmd_lx->next_src1_y = y1 + start_y;
+++              cmd_lx->next_src1_base = src1_base;
+++              cmd_lx->next_src2_x = x2 + start_x;
+++              cmd_lx->next_src2_y = y2 + start_y;
+++              cmd_lx->next_src2_base = src2_base;
+++              cmd_y->p.w = FFMIN(nPbW - start_x, 8);   // last column of tiles may be narrower
+++              cmd_y->p.h = bh;
+++              cmd_y->p.mymx21 = my2_mx2_my_mx;
+++              cmd_y->p.wo1 = wo1;
+++              cmd_y->p.wo2 = wo2;
+++              cmd_y->p.dst_addr =  dst + start_x;
+++              yp->last_lx = cmd_y;
+++              yp->qpu_mc_curr = cmd_y + 1;
+++          }
+++          dst += s->frame->linesize[0] * Y_B_MAX_H;   // advance by the same row count the outer loop steps by
+++        }
+++    }
+++}
+++
+++
+++static HEVCRpiChromaPred *
+++rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val)
+++{   // Return the entry with the smallest load among QPU_N_GRP_UV entries starting at s->curr_pred_c, after bumping its load
+++    HEVCRpiChromaPred * cp = s->curr_pred_c;   // best candidate so far
+++    HEVCRpiChromaPred * cpt = cp + 1;          // entry under test
+++    for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) {   // visits the remaining QPU_N_GRP_UV - 1 entries
+++        if (cpt->load < cp->load)   // strict '<': the earliest entry wins ties
+++            cp = cpt;
+++    }
+++    // Actual use of load_val is noticeably better but we haven't sorted Q length problems yet
+++    ++cp->load;   // every request counts as one unit for now; load_val is currently unused
+++//    cp->load += load_val;
+++    return cp;
+++}
+++
+++static void
+++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c,
+++  const int nPbW_c, const int nPbH_c,
+++  const Mv * const mv,
+++  const int16_t * const c_weights,
+++  const int16_t * const c_offsets,
+++  AVFrame * const src_frame)
+++{
+++    // Queue uni-predicted chroma commands for the nPbW_c x nPbH_c block at (x0_c, y0_c); index 0/1 of c_weights/c_offsets feed wo_u/wo_v.
+++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
+++#if 0
+++    av_assert0(s->frame->linesize[1] == s->frame->linesize[2]);
+++
+++    rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1],
+++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
+++                c_weights[0], c_offsets[0]);
+++
+++    rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2],
+++                x0_c, y0_c, nPbW_c, nPbH_c, mv,
+++                c_weights[1], c_offsets[1]);
+++#endif
+++    {
+++        const int hshift           = s->ps.sps->hshift[1];
+++        const int vshift           = s->ps.sps->vshift[1];
+++        // Upper MV bits give the fetch origin (less 1, presumably the filter margin - confirm); lower bits pick the coefficients.
+++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;   // vertical component uses vshift, matching y_coeffs below
+++        const uint32_t src_base_u = get_vc_address_u(src_frame);
+++        const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
+++        const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
+++        const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]);
+++        const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]);
+++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;   // destination of the current row of tiles
+++
+++        for(int start_y=0;start_y < nPbH_c;start_y+=16)
+++        {
+++            const int bh = FFMIN(nPbH_c-start_y, 16);   // last row of tiles may be shorter
+++
+++            for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
+++            {
+++                HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3);
+++                qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;       // entry being filled in for this tile
+++                qpu_mc_pred_c_t * const last_l0 = cp->last_l0;     // earlier entry: receives this tile's source setup
+++                const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);   // last column may be narrower
+++
+++                u[-1].next_fn  = s->qpu_filter_uv;   // the preceding entry names the function for this tile
+++                last_l0->next_src_x = x1_c + start_x;
+++                last_l0->next_src_y = y1_c + start_y;
+++                last_l0->next_src_base_c = src_base_u;
+++                u[0].p.h = bh;
+++                u[0].p.w = bw;
+++                u[0].p.coeffs_x = x_coeffs;
+++                u[0].p.coeffs_y = y_coeffs;
+++                u[0].p.wo_u = wo_u;
+++                u[0].p.wo_v = wo_v;
+++                u[0].p.dst_addr_c = dst_base_u + start_x * 2;
+++                cp->last_l0 = u;
+++                cp->qpu_mc_curr = u + 1;
+++            }
+++
+++            dst_base_u += s->frame->linesize[1] * 16;   // 16 rows, matching the outer loop step
+++        }
+++    }
+++  return;
+++}
+++
+++static void
+++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c,
+++  const int nPbW_c, const int nPbH_c,
+++  const struct MvField * const mv_field,
+++  const int16_t * const c_weights,
+++  const int16_t * const c_offsets,
+++  const int16_t * const c_weights2,
+++  const int16_t * const c_offsets2,
+++  AVFrame * const src_frame,
+++  AVFrame * const src_frame2)
+++{   // Queue bi-predicted chroma commands for the nPbW_c x nPbH_c block at (x0_c, y0_c); each tile uses two queue entries
+++    const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c);
+++#if 0
+++    rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2,
+++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0);
+++
+++    rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2,
+++                 x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1);
+++#endif
+++    {
+++        const int hshift = s->ps.sps->hshift[1];
+++        const int vshift = s->ps.sps->vshift[1];
+++        const Mv * const mv = mv_field->mv + 0;
+++        const Mv * const mv2 = mv_field->mv + 1;
+++        // First MV (read from src_frame): filter coefficients and fetch origin
+++        const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
+++        const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
+++        const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
+++        const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
+++        const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
+++        const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;   // vertical component uses vshift, matching my above
+++        // Second MV (read from src_frame2): filter coefficients and fetch origin
+++        const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
+++        const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
+++        const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
+++        const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
+++
+++        const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
+++        const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1;   // vertical component uses vshift, matching my2 above
+++
+++        uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off;   // destination of the current row of tiles
+++
+++        for (int start_y = 0; start_y < nPbH_c; start_y += 16) {
+++          const unsigned int bh = FFMIN(nPbH_c-start_y, 16);
+++
+++          // We are allowed 3/4 powers of two as well as powers of 2
+++          av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2);
+++
+++          for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) {
+++              const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
+++
+++              HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3);
+++              qpu_mc_pred_c_t * const u = cp->qpu_mc_curr;       // u[0] and u[1] are filled in for this tile
+++              qpu_mc_pred_c_t * const last_l0 = cp->last_l0;
+++              qpu_mc_pred_c_t * const last_l1 = cp->last_l1;
+++
+++              u[-1].next_fn = s->qpu_filter_uv_b0;   // the preceding entry names the function for this tile
+++              last_l0->next_src_x = x1_c + start_x;
+++              last_l0->next_src_y = y1_c + start_y;
+++              last_l0->next_src_base_c = get_vc_address_u(src_frame);
+++
+++              u[0].next_fn = 0;  // Ignored - 2 block cmd
+++              u[0].next_src_x = x2_c + start_x;
+++              u[0].next_src_y = y2_c + start_y;
+++              u[0].next_src_base_c = get_vc_address_u(src_frame2);
+++
+++              u[0].b0.h = (bh<16 ? bh : 16);   // bh and bw are already clamped above; these re-clamps are no-ops
+++              u[0].b0.w = (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH);
+++              u[0].b0.coeffs_x = coefs0_x;
+++              u[0].b0.coeffs_y = coefs0_y;
+++              u[0].b0.weight_u = c_weights[0]; // Weight L0 U
+++              u[0].b0.weight_v = c_weights[1]; // Weight L0 V
+++              u[0].b0.dummy0 = 0;  // Intermediate results are not written back in first pass of B filtering
+++
+++              last_l1->next_src_x = x2_c + start_x;
+++              last_l1->next_src_y = y2_c + start_y;
+++              last_l1->next_src_base_c = get_vc_address_u(src_frame2);
+++
+++              u[1].b1.dummy0 = 0;  // w,h inherited from b0
+++              u[1].b1.coeffs_x = coefs1_x;
+++              u[1].b1.coeffs_y = coefs1_y;
+++              u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]);   // summed offsets + 1, second weight
+++              u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]);
+++              u[1].b1.dst_addr_c = dst_base_u + start_x * 2;
+++
+++              cp->last_l0 = u;
+++              cp->last_l1 = u + 1;
+++              cp->qpu_mc_curr = u + 2;   // this tile consumed two entries
+++          }
+++
+++          dst_base_u += s->frame->linesize[1] * 16;   // 16 rows, matching the outer loop step
+++        }
+++    }
+++}
+++#endif
+++
+++
+++
+ +static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+ +                                const int nPbW, const int nPbH,
+ +                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+@@ -3080,7 +4796,7 @@ index b478065..88dd40b 100644
+      int merge_idx = 0;
+      struct MvField current_mv = {{{ 0 }}};
+  
+-@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+      int y_cb             = y0 >> log2_min_cb_size;
+      int x_pu, y_pu;
+      int i, j;
+@@ -3090,315 +4806,112 @@ index b478065..88dd40b 100644
+  
+      if (!skip_flag)
+          lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+-@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+++#if RPI_INTER
+ +        if (s->enable_rpi) {
+-+            const Mv * const mv    = &current_mv.mv[0];
+-+            const unsigned int mx          = mv->x & 3;
+-+            const unsigned int my          = mv->y & 3;
+-+            const unsigned int my_mx       = (my<<8) | mx;
+-+            const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
+-+            const int x1_m3 = x0 + (mv->x >> 2) - 3;
+-+            const int y1_m3 = y0 + (mv->y >> 2) - 3;
+-+            const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame);
+-+            uint32_t * y = s->curr_y_mvs;
+-+
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16);
+-+
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  const int bw = nPbW-start_x;
+-+                  const int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y;
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]];
+-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1;
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
+++              s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+++              ref0->frame);
+ +        } else
+ +#endif
+ +        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame,
+++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+                      &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                      s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+                      s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+-+#ifdef RPI_INTER_QPU
+-+          if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                uint32_t *u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1,
+-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1,
+-+                                   s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++#if RPI_INTER
+++            if (s->enable_rpi) {
+++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
+++                  s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+++                  ref0->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
++             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+                            0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                            s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2],
+-                           0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]);
+-         }
+-@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+-+#ifdef RPI_LUMA_QPU
+++#if RPI_INTER
+ +        if (s->enable_rpi) {
+-+            const int reflist = 1;
+-+            const Mv *mv    = &current_mv.mv[reflist];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            int my2_mx2_my_mx = (my_mx << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            uint32_t *y = s->curr_y_mvs;
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=16) {
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+                  *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]];
+-+                  *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1;
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++            rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
+++              s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+++              ref1->frame);
+ +        } else
+ +#endif
+-+
+ +        {
+-+            RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame,
+++            luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+                      &current_mv.mv[1], x0, y0, nPbW, nPbH,
+                      s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+                      s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +            if (s->enable_rpi) {
+-+                const int reflist = 1;
+-+                const int hshift           = s->ps.sps->hshift[1];
+-+                const int vshift           = s->ps.sps->vshift[1];
+-+                const Mv * const mv        = &current_mv.mv[reflist];
+-+                const intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                const intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                const intptr_t _mx         = mx << (1 - hshift);
+-+                const intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+
+-+                const int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                const int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                uint32_t * u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      const int bw = nPbW_c-start_x;
+-+                      const int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++                rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
+++                  s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+++                  ref1->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
++             chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+                            1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+                            s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+- 
+--            chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-+            RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2],
+-                           1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+-                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]);
+-         }
+-@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
++@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+          int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+          int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+  
+ -        luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#ifdef RPI_LUMA_QPU
+-+        if (s->enable_rpi && 0) {
+-+            const Mv *mv    = &current_mv.mv[0];
+-+            int mx          = mv->x & 3;
+-+            int my          = mv->y & 3;
+-+            int my_mx = (my<<8) + mx;
+-+            const Mv *mv2    = &current_mv.mv[1];
+-+            int mx2          = mv2->x & 3;
+-+            int my2          = mv2->y & 3;
+-+            int my2_mx2 = (my2<<8) + mx2;
+-+            int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx;
+-+            int x1 = x0 + (mv->x >> 2);
+-+            int y1 = y0 + (mv->y >> 2);
+-+            int x2 = x0 + (mv2->x >> 2);
+-+            int y2 = y0 + (mv2->y >> 2);
+-+            uint32_t *y = s->curr_y_mvs;
+-+            for(int start_y=0;start_y < nPbH;start_y+=16) {  // Potentially we could change the assembly code to support taller sizes in one go
+-+              for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time
+-+                  int bw = nPbW-start_x;
+-+                  int bh = nPbH-start_y;
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame);
+-+                  *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16);
+-+                  *y++ = my2_mx2_my_mx;
+-+
+-+                  *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+-+                               s->sh.luma_weight_l0[current_mv.ref_idx[0]]);
+-+                  *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] +
+-+                         s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1;
+-+
+-+                  *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]);
+-+                  y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b;
+-+                }
+-+            }
+-+            s->curr_y_mvs = y;
+++#if RPI_INTER
+++        if (s->enable_rpi) {
+++            rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+ +        } else
+ +#endif
+ +        {
+-+            RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame,
+++            luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+                     &current_mv.mv[0], x0, y0, nPbW, nPbH,
+                     ref1->frame, &current_mv.mv[1], &current_mv);
+ +        }
+  
+          if (s->ps.sps->chroma_format_idc) {
+--            chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +          if (s->enable_rpi) {
+-+                int hshift           = s->ps.sps->hshift[1];
+-+                int vshift           = s->ps.sps->vshift[1];
+-+                const Mv *mv         = &current_mv.mv[0];
+-+                intptr_t mx          = av_mod_uintp2(mv->x, 2 + hshift);
+-+                intptr_t my          = av_mod_uintp2(mv->y, 2 + vshift);
+-+                intptr_t _mx         = mx << (1 - hshift);
+-+                intptr_t _my         = my << (1 - vshift); // Fractional part of motion vector
+-+                int x1_c = x0_c + (mv->x >> (2 + hshift));
+-+                int y1_c = y0_c + (mv->y >> (2 + hshift));
+-+
+-+                const Mv *mv2         = &current_mv.mv[1];
+-+                intptr_t mx2          = av_mod_uintp2(mv2->x, 2 + hshift);
+-+                intptr_t my2          = av_mod_uintp2(mv2->y, 2 + vshift);
+-+                intptr_t _mx2         = mx2 << (1 - hshift);
+-+                intptr_t _my2         = my2 << (1 - vshift); // Fractional part of motion vector
+-+
+-+                int x2_c = x0_c + (mv2->x >> (2 + hshift));
+-+                int y2_c = y0_c + (mv2->y >> (2 + hshift));
+-+
+-+
+-+                uint32_t *u = s->curr_u_mvs;
+-+                for(int start_y=0;start_y < nPbH_c;start_y+=16) {
+-+                  for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) {
+-+                      int bw = nPbW_c-start_x;
+-+                      int bh = nPbH_c-start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx][0];
+-+                      *u++ = rpi_filter_coefs[_my][0];
+-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U
+-+                      *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V
+-+                      *u++ = 0;  // Intermediate results are not written back in first pass of B filtering
+-+                      *u++ = 0;
+-+
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y;
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame);
+-+                      u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame);
+-+                      *u++ = ( (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH) << 16 ) + (bh<16 ? bh : 16);
+-+                      *u++ = rpi_filter_coefs[_mx2][0];
+-+                      *u++ = rpi_filter_coefs[_my2][0];
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0] +
+-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]);
+-+                      *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] +
+-+                                     s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1,
+-+                                   s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]);
+-+                      *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]);
+-+                      *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]);
+-+                    }
+-+                }
+-+                s->curr_u_mvs = u;
+++              rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
+++                           &current_mv,
+++                           s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+++                           s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+++                           s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+++                           s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+++                           ref0->frame,
+++                           ref1->frame);
+ +                return;
+ +            }
+ +#endif
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
++             chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+                           x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+  
+--            chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-+            RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame,
+-                          x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 1);
+-         }
+-     }
+-@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
++@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
++                 intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
++                 ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
++                 if (s->ps.sps->pcm.loop_filter_disable_flag)
+++                {
++                     set_deblocking_bypass(s, x0, y0, log2_cb_size);
+++                }
++ 
++                 if (ret < 0)
++                     return ret;
++@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+      lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0)  && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
+  }
+  
+@@ -3415,6 +4928,7 @@ index b478065..88dd40b 100644
+ +    s->num_dblk_cmds[job] = 0;
+ +}
+ +
+++#if 0
+ +static void rpi_execute_transform(HEVCContext *s)
+ +{
+ +    int i=2;
+@@ -3430,7 +4944,7 @@ index b478065..88dd40b 100644
+ +        s->hevcdsp.idct[5-2](coeffs, 32);
+ +    }*/
+ +
+-+    gpu_cache_flush(&s->coeffs_buf_accelerated[job]);
+++    rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +    s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2],
+ +                               s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+ +                               s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]);
+@@ -3441,12 +4955,16 @@ index b478065..88dd40b 100644
+ +    for(i=0;i<4;i++)
+ +        s->num_coeffs[job][i] = 0;
+ +}
+++#endif
+++
+ +
+-+static void rpi_execute_pred_cmds(HEVCContext *s)
+++// I-pred, transform_and_add for all block types done here
+++// All ARM
+++static void rpi_execute_pred_cmds(HEVCContext * const s)
+ +{
+ +  int i;
+ +  int job = s->pass1_job;
+-+  HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+++  const HEVCPredCmd *cmd = s->univ_pred_cmds[job];
+ +#ifdef RPI_WORKER
+ +  HEVCLocalContextIntra *lc = &s->HEVClcIntra;
+ +#else
+@@ -3454,43 +4972,65 @@ index b478065..88dd40b 100644
+ +#endif
+ +
+ +  for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) {
+-+      //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+-+      if (cmd->type == RPI_PRED_INTRA) {
+-+          lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode;
+-+          lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+-+          lc->na.cand_left         = (cmd->na >> 3) & 1;
+-+          lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+-+          lc->na.cand_up           = (cmd->na >> 1) & 1;
+-+          lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+-+          s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx);
+-+      } else {
+-+#ifdef RPI_PRECLEAR
+-+          int trafo_size = 1 << cmd->size;
+-+#endif
+-+          s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride);
+++//      printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job);
+++
+++      switch (cmd->type)
+++      {
+++          case RPI_PRED_INTRA:
+++              lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
+++              lc->na.cand_bottom_left  = (cmd->na >> 4) & 1;
+++              lc->na.cand_left         = (cmd->na >> 3) & 1;
+++              lc->na.cand_up_left      = (cmd->na >> 2) & 1;
+++              lc->na.cand_up           = (cmd->na >> 1) & 1;
+++              lc->na.cand_up_right     = (cmd->na >> 0) & 1;
+++              if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0)
+++                  s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+++              else
+++                  s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
+++              break;
+++
+++          case RPI_PRED_ADD_RESIDUAL:
+++              s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+ +#ifdef RPI_PRECLEAR
+-+          memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache
+++              memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache
+ +#endif
+++              break;
+++          case RPI_PRED_ADD_RESIDUAL_U:
+++              s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+++              break;
+++          case RPI_PRED_ADD_RESIDUAL_V:
+++              s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
+++              break;
+++
+++          case RPI_PRED_I_PCM:
+++              pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
+++              break;
+++
+++          default:
+++              av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
+++              abort();
+ +      }
+ +  }
+ +  s->num_pred_cmds[job] = 0;
+ +}
+ +
+-+static void rpi_execute_inter_cmds(HEVCContext *s)
+++// Do any inter-pred that we want to do in software
+++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here
+++// All ARM
+++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only)
+ +{
+-+    int job = s->pass1_job;
+-+    HEVCMvCmd *cmd = s->unif_mv_cmds[job];
+-+    int n,cidx;
+++    unsigned int cidx;
+ +    AVFrame myref;
+ +    AVFrame myref1;
+ +    struct MvField mymv;
+-+    if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) {
+-+        printf("Overflow inter_cmds\n");
+-+        exit(-1);
+-+    }
+-+    for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) {
+++
+++    for(; n>0 ; n--, cmd++) {
+++        av_assert0(0);
+++
+ +        switch(cmd->cmd) {
+ +        case RPI_CMD_LUMA_UNI:
+++            if (b_only)
+++                break;
+ +            myref.data[0] = cmd->src;
+ +            myref.linesize[0] = cmd->srcstride;
+ +            luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset);
+@@ -3507,6 +5047,8 @@ index b478065..88dd40b 100644
+ +                       &myref1, &cmd->mv1, &mymv);
+ +            break;
+ +        case RPI_CMD_CHROMA_UNI:
+++            if (b_only)
+++                break;
+ +            mymv.mv[0] = cmd->mv;
+ +            chroma_mc_uni(s, cmd->dst,
+ +                          cmd->dststride, cmd->src, cmd->srcstride, 0,
+@@ -3528,618 +5070,385 @@ index b478065..88dd40b 100644
+ +            break;
+ +        }
+ +    }
+-+    s->num_mv_cmds[job] = 0;
+ +}
+ +
+-+static void rpi_do_all_passes(HEVCContext *s)
+++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only)
+ +{
+-+    // Kick off QPUs and VPUs
+-+    rpi_launch_vpu_qpu(s);
+-+    // Perform luma inter prediction
+-+    rpi_execute_inter_cmds(s);
+-+    // Wait for transform completion
+-+    vpu_wait(s->vpu_id);
+-+    // Perform intra prediction and residual reconstruction
+-+    rpi_execute_pred_cmds(s);
+-+    // Perform deblocking for CTBs in this row
+-+    rpi_execute_dblk_cmds(s);
+-+    // Prepare next batch
+-+    rpi_begin(s);
+++    const int job = s->pass1_job;
+++
+++    if (!qpu_luma || luma_b_only)
+++        do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma);
+++    s->num_mv_cmds_y[job] = 0;
+++    if (!qpu_chroma || chroma_b_only)
+++        do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma);
+++    s->num_mv_cmds_c[job] = 0;
+ +}
+ +
+ +#endif
+ +
+ +#ifdef RPI
+++// Set initial uniform job values & zero ctu_count
+ +static void rpi_begin(HEVCContext *s)
+ +{
+++#if RPI_INTER
+ +    int job = s->pass0_job;
+ +    int i;
+-+#ifdef RPI_INTER_QPU
+-+    int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[1];
+-+    int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[1];
+-+
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[job][i] = s->mvs_base[job][i];
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = pic_width;
+-+        *s->u_mvs[job][i]++ = pic_height;
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[1];
+-+        *s->u_mvs[job][i]++ = s->frame->linesize[2];
+-+        *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6;
+-+        *s->u_mvs[job][i]++ = 0;
+-+        *s->u_mvs[job][i]++ = i;  // Select section of VPM (avoid collisions with 3d unit)
+-+    }
+-+    s->curr_u_mvs = s->u_mvs[job][0];
+-+#endif
+ +
+-+#ifdef RPI_LUMA_QPU
+-+    for(i=0;i<12;i++) {
+-+        // This needs to have a generally similar structure to the
+-+        // actual filter code as various pipelined bits need to land correctly
+-+        // when inserted by the filter requests
+-+        s->y_mvs[job][i] = s->y_mvs_base[job][i];
+-+        *s->y_mvs[job][i]++ = 0; // y_x
+-+        *s->y_mvs[job][i]++ = 0; // ref_y_base
+-+        *s->y_mvs[job][i]++ = 0; // y2_x2
+-+        *s->y_mvs[job][i]++ = 0; // ref_y2_base
+-+        *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height;
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch
+-+        *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch
+-+        *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6;  // weight demon + 6
+-+        *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block
+-+        *s->y_mvs[job][i]++ = 0; // Next kernel
+++    const uint16_t pic_width_y        = s->ps.sps->width;
+++    const uint16_t pic_height_y       = s->ps.sps->height;
+++
+++    const uint16_t pic_width_c        = s->ps.sps->width >> s->ps.sps->hshift[1];
+++    const uint16_t pic_height_c       = s->ps.sps->height >> s->ps.sps->vshift[1];
+++
+++    for(i=0; i < QPU_N_UV;i++) {
+++        HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i;
+++        qpu_mc_pred_c_t * u = cp->qpu_mc_base;
+++
+++        // Chroma setup is a double block with L0 fetch
+++        // and other stuff in the 1st block and L1 fetch
+++        // in the 2nd along with a lot of dummy vars
+++        // This could be packed a lot tighter but it would make
+++        // L0, L1 management a lot harder
+++
+++        u->next_fn = 0;
+++        u->next_src_x = 0;
+++        u->next_src_y = 0;
+++        u->next_src_base_c = 0;
+++        u->s0.pic_cw = pic_width_c;
+++        u->s0.pic_ch = pic_height_c;
+++        u->s0.stride2 = rpi_sliced_frame_stride2(s->frame);
+++        u->s0.stride1 = s->frame->linesize[1];
+++        u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6;
+++        u->s0.dummy0 = 0;
+++        cp->last_l0 = u;
+++        ++u;
+++
+++        u->next_fn = 0;
+++        u->next_src_x = 0;
+++        u->next_src_y = 0;
+++        u->next_src_base_c = 0;
+++        u->s1.dummy0 = 0;
+++        u->s1.dummy1 = 0;
+++        u->s1.dummy2 = 0;
+++        u->s1.dummy3 = 0;
+++        u->s1.dummy4 = 0;
+++        u->s1.dummy5 = 0;
+++        cp->last_l1 = u;
+++        ++u;
+++
+++        cp->load = 0;
+++        cp->qpu_mc_curr = u;
+++    }
+++    s->curr_pred_c = NULL;
+++
+++    for(i=0;i < QPU_N_Y;i++) {
+++        HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i;
+++        qpu_mc_pred_y_t * y = yp->qpu_mc_base;
+++
+++        y->next_src1_x = 0;
+++        y->next_src1_y = 0;
+++        y->next_src1_base = 0;
+++        y->next_src2_x = 0;
+++        y->next_src2_y = 0;
+++        y->next_src2_base = 0;
+++        y->s.pic_h = pic_height_y;
+++        y->s.pic_w = pic_width_y;
+++        y->s.stride2 = rpi_sliced_frame_stride2(s->frame);
+++        y->s.stride1 = s->frame->linesize[0];
+++        y->s.wdenom = s->sh.luma_log2_weight_denom + 6;
+++        y->s.dummy0 = 0;
+++        y->next_fn = 0;
+++        yp->last_lx = y;
+++        ++y;
+++
+++        yp->load = 0;
+++        yp->qpu_mc_curr = y;
+ +    }
+-+    s->curr_y_mvs = s->y_mvs[job][0];
+++    s->curr_pred_y = NULL;
+++    s->last_y8_p = NULL;
+++    s->last_y8_lx = NULL;
+ +#endif
+ +    s->ctu_count = 0;
+ +}
+ +#endif
+ +
+-+#ifdef RPI_SIMULATE_QPUS
+ +
+-+static int32_t clipx(int x,int FRAME_WIDTH)
+++#if RPI_INTER
+++static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
+ +{
+-+	if (x<=0) return 0;
+-+	if (x>=FRAME_WIDTH) return FRAME_WIDTH-1;
+-+	return x;
+-+}
+++    unsigned int i;
+++    const uint32_t exit_fn = qpu_fn(mc_exit);
+++    const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);
+++    unsigned int tc = 0;
+++    HEVCRpiJob * const jb = s->jobs + job;
+++
+++    // Add final commands to Q
+++    for(i = 0; i != QPU_N_Y; ++i) {
+++        HEVCRpiLumaPred * const yp = jb->luma_mvs + i;
+++        qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx;
+++
+++        // We will always have had L0 if we have L1 so only test L0
+++        if (px != yp->qpu_mc_base)
+++            tc = 1;
+++
+++        yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
+++
+++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+++        px->next_src1_x = MC_DUMMY_X;
+++        px->next_src1_y = MC_DUMMY_Y;
+++        px->next_src1_base = s->qpu_dummy_frame;
+++        px->next_src2_x = MC_DUMMY_X;
+++        px->next_src2_y = MC_DUMMY_Y;
+++        px->next_src2_base = s->qpu_dummy_frame;
+++
+++        yp->last_lx = NULL;
+++    }
+ +
+-+static int32_t clipy(int y,int FRAME_HEIGHT)
+-+{
+-+	if (y<=0) return 0;
+-+	if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1;
+-+	return y;
+++    return tc;
+ +}
+ +
+-+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset)
+-+{
+-+   int32_t vsum = 0;
+-+   int x, y;
+++#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c
+++#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n)
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job)
+++{
+++    unsigned int i;
+++    const uint32_t exit_fn = qpu_fn(mc_exit_c);
+++    const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV));
+++    unsigned int tc = 0;
+++    HEVCRpiJob * const jb = s->jobs + job;
+++
+++    // Add final commands to Q
+++    for(i = 0; i != QPU_N_UV; ++i) {
+++        HEVCRpiChromaPred * const cp = jb->chroma_mvs + i;
+++        qpu_mc_pred_c_t *const p0 = cp->last_l0;
+++        qpu_mc_pred_c_t *const p1 = cp->last_l1;
+++
+++        // We will always have had L0 if we have L1 so only test L0
+++        if (p0 != cp->qpu_mc_base)
+++            tc = 1;
+++
+++        cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2;  // Actual fn ptr
+++
+++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
+++        p0->next_src_x = MC_DUMMY_X;
+++        p0->next_src_y = MC_DUMMY_Y;
+++        p0->next_src_base_c = s->qpu_dummy_frame;
+++        p1->next_src_x = MC_DUMMY_X;
+++        p1->next_src_y = MC_DUMMY_Y;
+++        p1->next_src_base_c = s->qpu_dummy_frame;;
+++
+++        cp->last_l0 = NULL;
+++        cp->last_l1 = NULL;
+++    }
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch];
+++    return tc;
+++}
+++#endif
+ +
+-+      vsum += lumaFilter[my][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+round)>>denom)+offset;
+++#ifdef RPI
+ +
+-+   return av_clip_uint8( vsum );
+-+}*/
+ +
+-+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+++static void flush_frame(HEVCContext *s,AVFrame *frame)
+ +{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int chromaFilterH[4];
+-+  int chromaFilterV[4];
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+-+  for(i=0;i<4;i++) {
+-+    chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24;
+-+    chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24;
+-+  }
+++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+++  rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+++  rpi_cache_flush_finish(rfe);
+++}
+ +
+-+   for (y = 0; y < 4; y++) {
+-+      int32_t hsum = 0;
+ +
+-+      for (x = 0; x < 4; x++)
+-+         hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+++// Core execution tasks
+++static void worker_core(HEVCContext * const s)
+++{
+++    worker_global_env_t * const wg = &worker_global_env;
+++    int arm_cost = 0;
+++//    vpu_qpu_wait_h sync_c;
+++    vpu_qpu_wait_h sync_y;
+++    int qpu_luma = 0;
+++    int qpu_chroma = 0;
+++    int gpu_load;
+++    int arm_load;
+++    static const int arm_const_cost = 2;
+++
+++//    static int z = 0;
+++
+++    const int job = s->pass1_job;
+++    unsigned int flush_start = 0;
+++    unsigned int flush_count = 0;
+++
+++    const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++
+++    if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) {
+++        vpu_qpu_job_add_vpu(vqj,
+++            vpu_get_fn(),
+++            vpu_get_constants(),
+++            s->coeffs_buf_vc[job][2],
+++            s->num_coeffs[job][2] >> 8,
+++            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+++            s->num_coeffs[job][3] >> 10,
+++            0);
+ +
+-+      vsum += chromaFilterV[y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+++        rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+++    }
+ +
+-+   return vsum;
+-+}
+ +
+-+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} };
+++#if RPI_INTER
+++    pthread_mutex_lock(&wg->lock);
+ +
+-+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height)
+-+{
+-+  int32_t vsum = 0;
+-+  int x, y;
+-+  int i;
+-+  int offset_after = offset_weight>>16;
+-+  int weight = (offset_weight<<16)>>16;
+++//    ++z;
+++    gpu_load = vpu_qpu_current_load();
+++    arm_load = avpriv_atomic_int_get(&wg->arm_load);
+++#if 0 // Y_B_ONLY
+++    qpu_luma =  gpu_load + 2 < arm_load;
+++    qpu_chroma = gpu_load < arm_load + 8;
+++#elif 0
+++    qpu_luma =  gpu_load < arm_load + 2;
+++    qpu_chroma = gpu_load < arm_load + 8;
+++#else
+++    qpu_chroma = 1;
+++    qpu_luma = 1;
+++#endif
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++    arm_cost = !qpu_chroma * 2 + !qpu_luma * 3;
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost);
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch];
+++    wg->gpu_c += qpu_chroma;
+++    wg->gpu_y += qpu_luma;
+++    wg->arm_c += !qpu_chroma;
+++    wg->arm_y += !qpu_luma;
+ +
+-+      vsum += lumaFilter[(my_mx>>8)&3][y]*hsum;
+-+   }
+-+   vsum >>= 6;
+-+   vsum = (((vsum*weight)+offset_before)>>denom)+offset_after;
+ +
+-+   return vsum;
+-+}
+++//    if ((z & 511) == 0) {
+++//        printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d    \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y);
+++//    }
+ +
+-+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx)
+-+{
+-+  //int pic_width        = s->ps.sps->width >> s->ps.sps->hshift[cIdx];
+-+  int pic_height       = s->ps.sps->height >> s->ps.sps->vshift[cIdx];
+-+  int pitch = frame->linesize[cIdx];
+-+  uint32_t base = cIdx == 0 ? get_vc_address_y(frame) :
+-+    cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
+-+  if (p>=base && p<base+pitch*pic_height) {
+-+    return frame->data[cIdx] + (p-base);
+-+  }
+-+  return NULL;
+-+}
+ +
+-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
+-+{
+-+  SliceHeader *sh   = &s->sh;
+-+  uint8_t *arm = test_frame(s,p,s->frame,cIdx);
+-+  int i;
+-+  if (arm) return arm;
+-+  if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
+-+  {
+-+    for(i=0;i<sh->nb_refs[L0];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+-+    }
+-+  }
+-+  if (sh->slice_type == B_SLICE) {
+-+    for(i=0;i<sh->nb_refs[L1];i++) {
+-+      arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
+-+      if (arm) return arm;
+++    {
+++        int (*d)[2] = s->dblk_cmds[job];
+++        unsigned int high=(*d)[1];
+++        int n;
+++
+++        flush_start = high;
+++        for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+++            unsigned int y = (*d)[1];
+++            flush_start = FFMIN(flush_start, y);
+++            high=FFMAX(high,y);
+++        }
+++        // Avoid flushing past end of frame
+++        flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
+ +    }
+-+  }
+-+  printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
+-+  exit(-1);
+-+  return NULL;
+-+}
+ +
+-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
+-+{
+-+  uint32_t next_kernel;
+-+  uint32_t x0;
+-+  uint32_t y0;
+-+  uint8_t *ref_u_base;
+-+  uint8_t *ref_v_base;
+-+  uint32_t frame_width = p[5];
+-+  uint32_t frame_height = p[6];
+-+  uint32_t pitch = p[7];
+-+  uint32_t dst_pitch = p[8];
+-+  int32_t offset_before = p[9];
+-+  int32_t denom = p[10];
+-+  uint32_t vpm_id = p[11];
+-+  uint32_t tmp_u_dst[256];
+-+  uint32_t tmp_v_dst[256];
+-+  while(1) {
+-+    p += 12;
+-+    next_kernel = p[0-12];
+-+    x0 = p[1-12];
+-+    y0 = p[2-12];
+-+    if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) {
+-+      int x,y;
+-+      uint32_t width_height = p[5];
+-+      uint32_t hcoeffs = p[6];
+-+      uint32_t vcoeffs = p[7];
+-+      uint32_t offset_weight_u = p[8];
+-+      uint32_t offset_weight_v = p[9];
+-+      uint8_t *this_u_dst;
+-+      uint8_t *this_v_dst;
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      ref_u_base = compute_arm_addr(s,p[3-12],1);
+-+      ref_v_base = compute_arm_addr(s,p[4-12],2);
+-+      if (next_kernel!=s->mc_filter_uv_b0)
+-+      {
+-+        this_u_dst = compute_arm_addr(s,p[10],1);
+-+        this_v_dst = compute_arm_addr(s,p[11],2);
+-+      }
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter_uv) {
+-+            int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          } else if (next_kernel==s->mc_filter_uv_b0) {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
+-+            tmp_u_dst[x+y*16] = refa;
+-+            tmp_v_dst[x+y*16] = refb;
+-+          } else {
+-+            int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
+-+            int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
+-+            this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
+-+            this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+-+        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+++#if !DISABLE_CHROMA
+++    if (qpu_chroma && mc_terminate_uv(s, job) != 0)
+++    {
+++        HEVCRpiJob * const jb = s->jobs + job;
+++        const uint32_t code = qpu_fn(mc_setup_c);
+++        uint32_t * p;
+++        unsigned int i;
+++        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+ +
+-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
+-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
+-+{
+-+  uint32_t next_kernel;
+-+  int y_x,y2_x2;
+-+  int x0;
+-+  int y0;
+-+  int x2;
+-+  int y2;
+-+  uint32_t *p0 = p;
+-+  uint8_t *ref_y_base;
+-+  uint8_t *ref_y2_base;
+-+  uint32_t frame_width_height = p[4];
+-+  uint32_t frame_width = frame_width_height>>16;
+-+  uint32_t frame_height = (frame_width_height<<16)>>16;
+-+  uint32_t pitch = p[5];
+-+  uint32_t dst_pitch = p[6];
+-+  int offset_shift = p[7];
+-+  int32_t offset_before = offset_shift>>16;
+-+  int32_t denom = (offset_shift<<16)>>16;
+-+  while(1) {
+-+    p += 9;
+-+    next_kernel = p[8-9];
+-+    y_x = p[0-9];
+-+    x0 = (y_x<<16)>>16;
+-+    y0 = y_x>>16;
+-+    y2_x2 = p[2-9];
+-+    x2 = (y2_x2<<16)>>16;
+-+    y2 = y2_x2>>16;
+-+
+-+    if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) {
+-+      // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+-+      int x,y;
+-+      uint32_t width_height = p[4];
+-+      uint32_t my2_mx2_my_mx = p[5];
+-+      uint32_t offset_weight = p[6];
+-+      uint8_t *this_dst = compute_arm_addr(s,p[7],0);
+-+      uint32_t width = width_height >> 16;
+-+      uint32_t height = (width_height << 16) >> 16;
+-+      uint8_t *dst_base = s->frame->data[0];
+-+      ref_y_base = compute_arm_addr(s,p[1-9],0);
+-+      ref_y2_base = compute_arm_addr(s,p[3-9],0);
+-+      for (y=0; y<height; ++y) {
+-+        for (x=0; x<width; ++x) {
+-+          if (next_kernel==s->mc_filter) {
+-+            int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
+-+            refa = av_clip_uint8(refa);
+-+            this_dst[x+y*dst_pitch] = refa;
+-+          }
+-+          else {
+-+            int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
+-+            int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
+-+            this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
+-+          }
+++        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+++            *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm);
+++            *p++ = code;
+ +        }
+-+      }
+-+    } else {
+-+      av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
+-+      break;
+-+    }
+-+  }
+-+}
+-+
+-+static void rpi_simulate_inter_qpu(HEVCContext *s)
+-+{
+-+  // First run the transform as normal
+-+  int i;
+-+  rpi_execute_transform(s);
+-+  for(i=0;i<8;i++)
+-+  {
+-+    rpi_simulate_inter_chroma(s,s->mvs_base[i]);
+-+  }
+-+  for(i=0;i<12;i++)
+-+  {
+-+    rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
+-+  }
+-+}
+-+
+-+#endif
+ +
+-+#ifdef RPI_INTER_QPU
+++        vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
+ +
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+    int k;
+-+    int job = s->pass1_job;
+-+    int i;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc;
+-+#ifdef RPI_LUMA_QPU
+-+    uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc;
+++#if RPI_CACHE_UNIF_MVS
+++        rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +#endif
+-+    if (s->sh.slice_type == I_SLICE) {
+-+#ifdef RPI_MULTI_MAILBOX
+-+      rpi_execute_transform(s);
+-+      return;
+-+#endif
+-+    }
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V
+-+        av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU);
+++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++          flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
+ +    }
+-+
+-+    s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+#ifdef RPI_LUMA_QPU
+-+    for(k=0;k<12;k++) {
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request
+-+        s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform)
+-+        av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU);
+-+    }
+-+    s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+#endif
+-+
+-+#ifdef RPI_SIMULATE_QPUS
+-+    rpi_simulate_inter_qpu(s);
+-+    return;
+ +#endif
+ +
+-+#ifdef RPI_MULTI_MAILBOX
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job);
+-+#else
+-+    flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job);
+-+#endif
+++// We can take a sync here and try to locally overlap QPU processing with ARM
+++// but testing showed a slightly negative benefit with noticeable extra complexity
+++//    vpu_qpu_job_add_sync_this(vqj, &sync_c);
+ +
+-+#if 1
+++    if (qpu_luma && mc_terminate_y(s, job) != 0)
+ +    {
+-+        unsigned int i;
+++        HEVCRpiJob * const jb = s->jobs + job;
+++        const uint32_t code = qpu_fn(mc_setup);
+ +        uint32_t * p;
+-+        uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV);
+-+        uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
+++        unsigned int i;
+ +        uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
+ +
+-+        for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
+-+            *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm));
+-+            *p++ = code;
+-+        }
+-+
+-+        code = qpu_get_fn(QPU_MC_SETUP);
+ +        for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
+-+            *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm));
+++            *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm);
+ +            *p++ = code;
+ +        }
+ +
+-+        s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(),
+-+            vpu_get_constants(),
+-+            s->coeffs_buf_vc[job][2],
+-+            s->num_coeffs[job][2] >> 8,
+-+            s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3],
+-+            s->num_coeffs[job][3] >> 10,
+-+            0,
+-+            // QPU job 1
+-+            QPU_N_UV,
+-+            mail_uv,
+-+            // QPU job 2
+-+            QPU_N_Y,
+-+            mail_y
+-+            );
+-+    }
+++        vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
+ +
+-+#else
+-+    s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8,
+-+                                                                      s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0,
+-+                                   qpu_get_fn(QPU_MC_SETUP_UV),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+#ifdef RPI_LUMA_QPU
+-+                                   qpu_get_fn(QPU_MC_SETUP),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)),
+-+                                   (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm))
+-+#else
+-+                                   0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0,
+-+                                   0,0,0,0
+++#if RPI_CACHE_UNIF_MVS
+++        rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+ +#endif
+-+                                 );
+-+#endif
+-+    for(i=0;i<4;i++)
+-+        s->num_coeffs[job][i] = 0;
+-+#else
+-+#error Code rotted here
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm))
+-+      );
+-+#endif
+-+
+++        rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++          flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
+++    }
+ +
+-+}
+-+#else
+++    pthread_mutex_unlock(&wg->lock);
+ +
+-+#ifdef RPI
+-+static void rpi_launch_vpu_qpu(HEVCContext *s)
+-+{
+-+  rpi_execute_transform(s);
+-+}
+ +#endif
+ +
+-+#endif
+++    vpu_qpu_job_add_sync_this(vqj, &sync_y);
+ +
+-+#ifdef RPI
+++    // Having accumulated some commands - do them
+++    rpi_cache_flush_finish(rfe);
+++    vpu_qpu_job_finish(vqj);
+ +
+-+#ifndef RPI_FAST_CACHEFLUSH
+-+#error RPI_FAST_CACHEFLUSH is broken
+-+static void flush_buffer(AVBufferRef *bref) {
+-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref);
+-+    gpu_cache_flush(p);
+-+}
+++    memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job]));  //???? Surely we haven't done the smaller
+++
+++#if Y_B_ONLY
+++    if (qpu_luma)
+++        vpu_qpu_wait(&sync_y);
+ +#endif
+++    // Perform inter prediction
+++    rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0);
+ +
+-+static void flush_frame(HEVCContext *s,AVFrame *frame)
+-+{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-+    int n = s->ps.sps->height;
+-+    int curr_y = 0;
+-+    int curr_uv = 0;
+-+    int n_uv = n >> s->ps.sps->vshift[1];
+-+    int sz,base;
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-+    iocache.s[0].size  = sz;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-+    iocache.s[1].size  = sz;
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-+    iocache.s[2].size  = sz;
+-+    vcsm_clean_invalid( &iocache );
+++    // Wait for transform completion
+++
+++    // Perform intra prediction and residual reconstruction
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost);
+++#if Y_B_ONLY
+++    if (!qpu_luma)
+++        vpu_qpu_wait(&sync_y);
+ +#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+++    vpu_qpu_wait(&sync_y);
+ +#endif
+++    rpi_execute_pred_cmds(s);
+++
+++    // Perform deblocking for CTBs in this row
+++    rpi_execute_dblk_cmds(s);
+++
+++    avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost);
+ +}
+ +
+-+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job)
+++static void rpi_do_all_passes(HEVCContext *s)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    int n;
+-+    int curr_y;
+-+    int curr_uv;
+-+    int n_uv;
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame);
+-+    int sz,base;
+-+    int (*d)[2] = s->dblk_cmds[job];
+-+    int low=(*d)[1];
+-+    int high=(*d)[1];
+-+    for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
+-+        int y = (*d)[1];
+-+        low=FFMIN(low,y);
+-+        high=FFMAX(high,y);
+-+    }
+-+    curr_y = low;
+-+    n = high+(1 << s->ps.sps->log2_ctb_size);
+-+    curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+    n_uv = n >> s->ps.sps->vshift[1];
+-+
+-+    sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+    base = s->frame->linesize[1] * curr_uv;
+-+    iocache.s[0].handle = p.vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int)(p.arm) + base;
+-+    iocache.s[0].size  = sz;
+-+    p = get_gpu_mem_ptr_v(s->frame);
+-+    iocache.s[1].handle = p.vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int)(p.arm) + base;
+-+    iocache.s[1].size  = sz;
+-+    p = get_gpu_mem_ptr_y(s->frame);
+-+    sz = s->frame->linesize[0] * (n-curr_y);
+-+    base = s->frame->linesize[0] * curr_y;
+-+    iocache.s[2].handle = p.vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int)(p.arm) + base;
+-+    iocache.s[2].size  = sz;
+-+
+-+    iocache.s[3].handle = p0->vcsm_handle;
+-+    iocache.s[3].cmd = 3; // clean+invalidate
+-+    iocache.s[3].addr = (int) p0->arm;
+-+    iocache.s[3].size  = p0->numbytes;
+-+    if (p1) {
+-+      iocache.s[4].handle = p1->vcsm_handle;
+-+      iocache.s[4].cmd = 3; // clean+invalidate
+-+      iocache.s[4].addr = (int) p1->arm;
+-+      iocache.s[4].size  = p1->numbytes;
+-+    }
+-+    if (p2) {
+-+      iocache.s[5].handle = p2->vcsm_handle;
+-+      iocache.s[5].cmd = 3; // clean+invalidate
+-+      iocache.s[5].addr = (int) p2->arm;
+-+      iocache.s[5].size  = p2->numbytes;
+-+    }
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    flush_buffer(frame->buf[0]);
+-+    flush_buffer(frame->buf[1]);
+-+    flush_buffer(frame->buf[2]);
+-+    gpu_cache_flush3(p0, p1, p2);
+-+#endif
+++    // Do the various passes - common with the worker code
+++    worker_core(s);
+++    // Prepare next batch
+++    rpi_begin(s);
+ +}
+ +
+++
+++
+ +#endif
+ +
+  static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+  {
+      HEVCContext *s  = avctxt->priv_data;
+-@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+      int y_ctb       = 0;
+      int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
+  
+ +#ifdef RPI
+-+    s->enable_rpi = s->ps.sps->bit_depth == 8
+-+                    && !s->ps.pps->cross_component_prediction_enabled_flag;
+++    s->enable_rpi = s->ps.sps->bit_depth == 8 &&
+++        s->frame->format == AV_PIX_FMT_SAND128 &&
+++        !s->ps.pps->cross_component_prediction_enabled_flag;
+ +
+ +    if (!s->enable_rpi) {
+ +      if (s->ps.pps->cross_component_prediction_enabled_flag)
+@@ -4151,7 +5460,7 @@ index b478065..88dd40b 100644
+      if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
+          av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
+          return AVERROR_INVALIDDATA;
+-@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+          }
+      }
+  
+@@ -4166,26 +5475,25 @@ index b478065..88dd40b 100644
+      while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
+          int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
+  
+-@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++         y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
++         hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts);
++ 
+++
++         ff_hevc_cabac_init(s, ctb_addr_ts);
++ 
++         hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
++@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+          s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
+          s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
+  
+-+#ifdef RPI_INTER_QPU
+-+        s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8];
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12];
+++#if RPI_INTER
+++        s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV;
+++        s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y;
+ +#endif
+ +
+          more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+ +
+-+#ifdef RPI_INTER_QPU
+-+        s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs;
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+        s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs;
+-+#endif
+-+
+ +#ifdef RPI
+ +        if (s->enable_rpi) {
+ +          //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+@@ -4195,14 +5503,18 @@ index b478065..88dd40b 100644
+ +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+ +          s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+ +          s->ctu_count++;
+-+          //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job);
+ +
+ +          if ( s->ctu_count >= s->max_ctu_count ) {
+ +#ifdef RPI_WORKER
+-+            if (s->used_for_ref) {
+++            if (s->used_for_ref)
+++            {
+++//              printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
+++
+++//                worker_wait(s);
+ +              // Split work load onto separate threads so we make as rapid progress as possible with this frame
+ +              // Pass on this job to worker thread
+ +              worker_submit_job(s);
+++
+ +              // Make sure we have space to prepare the next job
+ +              worker_pass0_ready(s);
+ +
+@@ -4224,7 +5536,7 @@ index b478065..88dd40b 100644
+          if (more_data < 0) {
+              s->tab_slice_address[ctb_addr_rs] = -1;
+              return more_data;
+-@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
+  
+          ctb_addr_ts++;
+          ff_hevc_save_states(s, ctb_addr_ts);
+@@ -4249,12 +5561,25 @@ index b478065..88dd40b 100644
+ +        rpi_do_all_passes(s);
+ +    }
+ +
+++#if RPI_TSTATS
+++    {
+++        HEVCRpiStats *const ts = &s->tstats;
+++
+++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
+++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
+++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
+++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
+++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
+++        memset(ts, 0, sizeof(*ts));
+++    }
+++#endif
+++
+ +#endif
+ +
+      if (x_ctb + ctb_size >= s->ps.sps->width &&
+          y_ctb + ctb_size >= s->ps.sps->height)
+          ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size);
+-@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
++@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int
+      s = s1->sList[self_id];
+      lc = s->HEVClc;
+  
+@@ -4266,16 +5591,32 @@ index b478065..88dd40b 100644
+      if(ctb_row) {
+          ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]);
+  
+-@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
++@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal)
+          if (ret < 0)
+              return ret;
+  
+-+        s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N ||
+++        // The definition of _N unit types is "non-reference for other frames
+++        // with the same temporal_id" so they may/will be ref frames for pics
+++        // with a higher temporal_id.
+++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
+++            !(s->nal_unit_type == NAL_TRAIL_N ||
+ +                        s->nal_unit_type == NAL_TSA_N   ||
+ +                        s->nal_unit_type == NAL_STSA_N  ||
+ +                        s->nal_unit_type == NAL_RADL_N  ||
+ +                        s->nal_unit_type == NAL_RASL_N);
+ +
+++#if DEBUG_DECODE_N
+++        {
+++            static int z = 0;
+++            if (IS_IDR(s)) {
+++                z = 1;
+++            }
+++            if (z != 0 && z++ > DEBUG_DECODE_N) {
+++                s->is_decoded = 0;
+++                break;
+++            }
+++        }
+++#endif
+ +        if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
+ +            s->is_decoded = 0;
+ +            break;
+@@ -4283,27 +5624,30 @@ index b478065..88dd40b 100644
+          if (s->max_ra == INT_MAX) {
+              if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) {
+                  s->max_ra = s->poc;
+-@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length)
++         }
+      }
+  
+- fail:
++-fail:
+ -    if (s->ref && s->threads_type == FF_THREAD_FRAME)
+++fail:  // Also success path
+ +    if (s->ref && s->threads_type == FF_THREAD_FRAME) {
+-+#ifdef RPI_INTER_QPU
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height);
+++#if RPI_INTER
+++        rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height);
+ +#endif
+          ff_thread_report_progress(&s->ref->tf, INT_MAX, 0);
+ -
+-+    } else if (s->ref) {
+-+#ifdef RPI_INTER_QPU
+++    }
+++#if RPI_INTER
+++    else if (s->ref && s->enable_rpi) {
+ +      // When running single threaded we need to flush the whole frame
+ +      flush_frame(s,s->frame);
+-+#endif
+ +    }
+++#endif
+      return ret;
+  }
+  
+-@@ -3064,6 +4625,41 @@ fail:
++@@ -3064,6 +4764,41 @@ fail:
+      return AVERROR(ENOMEM);
+  }
+  
+@@ -4345,7 +5689,7 @@ index b478065..88dd40b 100644
+  static av_cold int hevc_decode_free(AVCodecContext *avctx)
+  {
+      HEVCContext       *s = avctx->priv_data;
+-@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+  
+      av_freep(&s->cabac_state);
+  
+@@ -4356,29 +5700,26 @@ index b478065..88dd40b 100644
+ +#endif
+ +
+ +    for(i=0;i<RPI_MAX_JOBS;i++) {
+-+      av_freep(&s->unif_mv_cmds[i]);
+-+      av_freep(&s->univ_pred_cmds[i]);
+ +
+-+#ifdef RPI_INTER_QPU
+-+      if (s->unif_mvs[i]) {
+-+        gpu_free( &s->unif_mvs_ptr[i] );
+-+        s->unif_mvs[i] = 0;
+-+      }
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+      if (s->y_unif_mvs[i]) {
+-+        gpu_free( &s->y_unif_mvs_ptr[i] );
+-+        s->y_unif_mvs[i] = 0;
+-+      }
+++        av_freep(&s->unif_mv_cmds_y[i]);
+++        av_freep(&s->unif_mv_cmds_c[i]);
+++        av_freep(&s->univ_pred_cmds[i]);
+++
+++#if RPI_INTER
+++        gpu_free(&s->jobs[i].chroma_mvs_gptr);
+++        gpu_free(&s->jobs[i].luma_mvs_gptr);
+ +#endif
+ +    }
+ +
+++    vpu_qpu_term();
+++
+++    av_rpi_zc_uninit(avctx);
+ +#endif
+ +
+      for (i = 0; i < 3; i++) {
+          av_freep(&s->sao_pixel_buffer_h[i]);
+          av_freep(&s->sao_pixel_buffer_v[i]);
+-@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
++@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+      return 0;
+  }
+  
+@@ -4398,75 +5739,76 @@ index b478065..88dd40b 100644
+  {
+      HEVCContext *s = avctx->priv_data;
+      int i;
+-+    int job;
+++#ifdef RPI
+++    unsigned int job;
+++#endif
+  
+      s->avctx = avctx;
+  
+-@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
++@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+      s->HEVClcList[0] = s->HEVClc;
+      s->sList[0] = s;
+  
+ +#ifdef RPI
+-+    for(job=0;job<RPI_MAX_JOBS;job++) {
+-+        s->unif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS);
+-+        if (!s->unif_mv_cmds[job])
+++    // Whilst FFmpeg's init fn is only called once, the close fn is called as
+++    // many times as we have threads (init_thread_copy is called for the
+++    // threads).  So to match init & term put the init here where it will be
+++    // called by both init & copy
+++    av_rpi_zc_init(avctx);
+++
+++    if (vpu_qpu_init() != 0)
+++        goto fail;
+++
+++    for(job = 0; job < RPI_MAX_JOBS; job++) {
+++        s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y);
+++        if (!s->unif_mv_cmds_y[job])
+++            goto fail;
+++        s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C);
+++        if (!s->unif_mv_cmds_c[job])
+ +            goto fail;
+ +        s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+ +        if (!s->univ_pred_cmds[job])
+ +            goto fail;
+ +    }
+ +
+-+#ifdef RPI_INTER_QPU
+++#if RPI_INTER
+ +    // We divide the image into blocks 256 wide and 64 high
+ +    // We support up to 2048 widths
+ +    // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+ +    // Also add space for the startup command for each stream.
+ +
+-+    {
+-+        int uv_commands_per_qpu = UV_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+		for(job=0;job<RPI_MAX_JOBS;job++) {
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+          gpu_malloc_cached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++    for (job = 0; job < RPI_MAX_JOBS; job++) {
+++        HEVCRpiJob * const jb = s->jobs + job;
+++#if RPI_CACHE_UNIF_MVS
+++        gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
+++        gpu_malloc_cached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
+ +#else
+-+          gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] );
+++        gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr);
+++        gpu_malloc_uncached(QPU_N_Y  * Y_COMMANDS_PER_QPU  * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr);
+ +#endif
+-+          s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm;
+ +
+-+          // Set up initial locations for uniform streams
+-+          p = s->unif_mvs[job];
+-+          for(i = 0; i < 8; i++) {
+-+            s->mvs_base[job][i] = p;
+-+            p += uv_commands_per_qpu;
+-+          }
+++        {
+++            qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm;
+++            for(i = 0; i < QPU_N_UV; i++) {
+++                jb->chroma_mvs[i].qpu_mc_base = p;
+++                jb->chroma_mvs[i].qpu_mc_curr = p;
+++                p += UV_COMMANDS_PER_QPU;
+++            }
+ +        }
+-+        s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV);
+-+        s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0);
+-+        s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B);
+-+    }
+-+
+-+#endif
+-+#ifdef RPI_LUMA_QPU
+-+    for(job=0;job<RPI_MAX_JOBS;job++)
+-+    {
+-+        int y_commands_per_qpu = Y_COMMANDS_PER_QPU;
+-+        uint32_t *p;
+-+#ifdef RPI_CACHE_UNIF_MVS
+-+        gpu_malloc_cached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+-+#else
+-+        gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] );
+-+#endif
+-+        s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm;
+-+
+-+        // Set up initial locations for uniform streams
+-+        p = s->y_unif_mvs[job];
+-+        for(i = 0; i < 12; i++) {
+-+            s->y_mvs_base[job][i] = p;
+-+            p += y_commands_per_qpu;
+++        {
+++            qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm;
+++            for(i = 0; i < QPU_N_Y; i++) {
+++                jb->luma_mvs[i].qpu_mc_base = p;
+++                jb->luma_mvs[i].qpu_mc_curr = p;
+++                p += Y_COMMANDS_PER_QPU;
+++            }
+ +        }
+ +    }
+-+    s->mc_filter = qpu_get_fn(QPU_MC_FILTER);
+-+    s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B);
+++    s->qpu_filter_uv = qpu_fn(mc_filter_uv);
+++    s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0);
+++    s->qpu_dummy_frame = qpu_fn(mc_setup_c);  // Use our code as a dummy frame
+++    s->qpu_filter = qpu_fn(mc_filter);
+++    s->qpu_filter_b = qpu_fn(mc_filter_b);
+ +#endif
+ +    //gpu_malloc_uncached(2048*64,&s->dummy);
+ +
+@@ -4481,8 +5823,30 @@ index b478065..88dd40b 100644
+      s->cabac_state = av_malloc(HEVC_CONTEXTS);
+      if (!s->cabac_state)
+          goto fail;
++@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
++     }
++ 
++     if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++-            s->threads_type = FF_THREAD_FRAME;
++-        else
++-            s->threads_type = FF_THREAD_SLICE;
+++        s->threads_type = FF_THREAD_FRAME;
+++    else
+++        s->threads_type = FF_THREAD_SLICE;
++ 
++     return 0;
++ }
++@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = {
++     .update_thread_context = hevc_update_thread_context,
++     .init_thread_copy      = hevc_init_thread_copy,
++     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
+++//                             0,
+++//                             AV_CODEC_CAP_FRAME_THREADS,
++                              AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
++     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++ };
+ diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
+-index be91010..6b03ea8 100644
++index be91010..dd7d152 100644
+ --- a/libavcodec/hevc.h
+ +++ b/libavcodec/hevc.h
+ @@ -23,6 +23,9 @@
+@@ -4495,37 +5859,53 @@ index be91010..6b03ea8 100644
+  #include "libavutil/buffer.h"
+  #include "libavutil/md5.h"
+  
+-@@ -37,6 +40,29 @@
++@@ -37,6 +40,45 @@
+  #include "thread.h"
+  #include "videodsp.h"
+  
+ +// define RPI to split the CABAC/prediction/transform into separate stages
+-+#ifdef RPI
+++#ifndef RPI
+ +
+-+  #include "rpi_qpu.h"
+-+  // Define RPI_INTER_QPU to use QPU for chroma inter prediction
+-+  #define RPI_INTER_QPU
+++  #define RPI_INTER          0
+++  #define RPI_TSTATS         0
+++  #define RPI_HEVC_SAND      0
+ +
+-+  #ifdef RPI_INTER_QPU
+-+    // Define RPI_LUMA_QPU to also use QPU for luma inter prediction
+-+    #define RPI_LUMA_QPU
+-+  #endif
+++#else
+++
+++  #include "rpi_qpu.h"
+++  #define RPI_INTER          1          // 0 use ARM for UV inter-pred, 1 use QPU
+ +
+-+  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+-+  #define RPI_MAX_JOBS 2
+ +  // Define RPI_WORKER to launch a worker thread for pixel processing tasks
+ +  #define RPI_WORKER
+++  // By passing jobs to a worker thread we hope to be able to catch up during slow frames
+++  // This has no effect unless RPI_WORKER is defined
+++  // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as
+++  // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one
+++  // free for the foreground to fill in.
+++  #define RPI_MAX_JOBS 2
+++
+ +  // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
+++  // As it stands there is something mildly broken in VPU deblock - looks mostly OK
+++  // but reliably fails some conformance tests (e.g. DBLK_A/B/C_)
+++  // With VPU luma & chroma pred it is much the same speed to deblock on the ARM
+ +//  #define RPI_DEBLOCK_VPU
+ +
+-+#endif
+++  #define RPI_VPU_DEBLOCK_CACHED 1
+++
+++  #if HAVE_NEON
+++  #define RPI_HEVC_SAND      1
+++  #else
+++  // Sand bust on Pi1 currently - reasons unknown
+++  #define RPI_HEVC_SAND      0
+++  #endif
+ +
+-+#define RPI_VPU_DEBLOCK_CACHED 1
+++  #define RPI_TSTATS 0
+++#endif
+ +
+  #define MAX_DPB_SIZE 16 // A.4.1
+  #define MAX_REFS 16
+  
+-@@ -660,17 +686,6 @@ typedef struct CodingUnit {
++@@ -660,17 +702,6 @@ typedef struct CodingUnit {
+      uint8_t cu_transquant_bypass_flag;
+  } CodingUnit;
+  
+@@ -4543,7 +5923,7 @@ index be91010..6b03ea8 100644
+  typedef struct NeighbourAvailable {
+      int cand_bottom_left;
+      int cand_left;
+-@@ -747,7 +762,17 @@ typedef struct HEVCFrame {
++@@ -747,7 +778,17 @@ typedef struct HEVCFrame {
+      uint8_t flags;
+  } HEVCFrame;
+  
+@@ -4561,7 +5941,7 @@ index be91010..6b03ea8 100644
+      uint8_t cabac_state[HEVC_CONTEXTS];
+  
+      uint8_t stat_coeff[4];
+-@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext {
++@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext {
+  
+      int qPy_pred;
+  
+@@ -4569,7 +5949,7 @@ index be91010..6b03ea8 100644
+  
+      uint8_t ctb_left_flag;
+      uint8_t ctb_up_flag;
+-@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext {
++@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext {
+      int ct_depth;
+      CodingUnit cu;
+      PredictionUnit pu;
+@@ -4577,7 +5957,7 @@ index be91010..6b03ea8 100644
+  
+  #define BOUNDARY_LEFT_SLICE     (1 << 0)
+  #define BOUNDARY_LEFT_TILE      (1 << 1)
+-@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext {
++@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext {
+      int boundary_flags;
+  } HEVCLocalContext;
+  
+@@ -4589,13 +5969,15 @@ index be91010..6b03ea8 100644
+ +// This is a distance of 1536 pixels across the screen
+ +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing,
+ +// but allocate more memory and increase the latency before data in the next frame can be processed
+-+#define RPI_NUM_CHUNKS 1
+++#define RPI_NUM_CHUNKS 4
+++#define RPI_CHUNK_SIZE 12
+ +
+ +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
+-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24)
+++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE)
+ +
+ +// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi
+-+#define RPI_MAX_MV_CMDS   (2*16*3*(RPI_MAX_WIDTH/4))
+++#define RPI_MAX_MV_CMDS_Y   (2*16*1*(RPI_MAX_WIDTH/4))
+++#define RPI_MAX_MV_CMDS_C   (2*16*2*(RPI_MAX_WIDTH/4))
+ +// Each block can have an intra prediction and a transform_add command
+ +#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4))
+ +// Worst case is 16x16 CTUs
+@@ -4612,53 +5994,118 @@ index be91010..6b03ea8 100644
+ +
+ +// Command for inter prediction
+ +typedef struct HEVCMvCmd {
+-+    int cmd;
+-+    uint8_t *dst;
+-+    ptrdiff_t dststride;
+++    uint8_t cmd;
+++    uint8_t block_w;
+++    uint8_t block_h;
+++    int8_t ref_idx[2];
+++    uint16_t dststride;
+++    uint16_t srcstride;
+++    uint16_t srcstride1;
+++    int16_t weight;
+++    int16_t offset;
+++    int16_t x_off;
+++    int16_t y_off;
+ +    uint8_t *src;
+-+    ptrdiff_t srcstride;
+-+    Mv mv;
+-+    int x_off;
+-+    int y_off;
+-+    int block_w;
+-+    int block_h;
+-+    int weight;
+-+    int offset;
+ +    uint8_t *src1;
+-+    ptrdiff_t srcstride1;
+++    uint8_t *dst;
+++    Mv mv;
+ +    Mv mv1;
+-+    int8_t ref_idx[2];
+ +} HEVCMvCmd;
+ +
+ +
+ +// Command for intra prediction and transform_add of predictions to coefficients
+-+#define RPI_PRED_TRANSFORM_ADD 0
+-+#define RPI_PRED_INTRA 1
+++enum rpi_pred_cmd_e
+++{
+++    RPI_PRED_ADD_RESIDUAL,
+++    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx (c_idx == 1)
+++    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx (c_idx == 2)
+++    RPI_PRED_INTRA,
+++    RPI_PRED_I_PCM,
+++    RPI_PRED_CMD_MAX
+++};
+++
+ +typedef struct HEVCPredCmd {
+-+    uint8_t size;
+ +    uint8_t type;
+-+    uint8_t na;
+-+    uint8_t c_idx;
+-+    union {
+-+        uint8_t *dst; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t x;   // RPI_PRED_INTRA
+-+    };
+-+    union {
+-+        int16_t *buf; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t y;   // RPI_PRED_INTRA
+-+    };
+++    uint8_t size;  // log2 "size" used by all variants
+++    uint8_t na;    // i_pred - but left here as they pack well
+++    uint8_t c_idx; // i_pred; also filled in by rpi_add_residual for the add-residual cmds
+ +    union {
+-+        enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD
+-+        uint32_t stride;         // RPI_PRED_INTRA
+++        struct {  // RPI_PRED_ADD_RESIDUAL, _U, _V (formerly TRANSFORM_ADD)
+++            uint8_t * dst;
+++            const int16_t * buf;
+++            uint32_t stride;
+++        } ta;
+++        struct {  // INTRA
+++            uint16_t x;
+++            uint16_t y;
+++            enum IntraPredMode mode;
+++        } i_pred;
+++        struct {  // I_PCM
+++            uint16_t x;
+++            uint16_t y;
+++            const void * src;
+++            uint32_t src_len;
+++        } i_pcm;
+ +    };
+ +} HEVCPredCmd;
+ +
+ +#endif
+ +
+++#ifdef RPI
+++
+++struct qpu_mc_pred_c_s;
+++struct qpu_mc_pred_y_s;
+++
+++typedef struct HEVCRpiLumaPred
+++{
+++    struct qpu_mc_pred_y_s *qpu_mc_base;
+++    struct qpu_mc_pred_y_s *qpu_mc_curr;
+++    struct qpu_mc_pred_y_s *last_lx;
+++    unsigned int load;
+++} HEVCRpiLumaPred;
+++
+++typedef struct HEVCRpiChromaPred
+++{
+++    struct qpu_mc_pred_c_s *qpu_mc_base;
+++    struct qpu_mc_pred_c_s *qpu_mc_curr;
+++    struct qpu_mc_pred_c_s *last_l0;
+++    struct qpu_mc_pred_c_s *last_l1;
+++    unsigned int load;
+++} HEVCRpiChromaPred;
+++
+++typedef struct HEVCRpiJob {
+++    GPU_MEM_PTR_T chroma_mvs_gptr;
+++    GPU_MEM_PTR_T luma_mvs_gptr;
+++    HEVCRpiChromaPred chroma_mvs[QPU_N_UV];
+++    HEVCRpiLumaPred luma_mvs[QPU_N_Y];
+++} HEVCRpiJob;
+++
+++#if RPI_TSTATS
+++typedef struct HEVCRpiStats {
+++    int y_pred1_y8_merge;
+++    int y_pred1_xy;
+++    int y_pred1_x0;
+++    int y_pred1_y0;
+++    int y_pred1_x0y0;
+++    int y_pred1_wle8;
+++    int y_pred1_wgt8;
+++    int y_pred1_hle16;
+++    int y_pred1_hgt16;
+++    int y_pred2_xy;
+++    int y_pred2_x0;
+++    int y_pred2_y0;
+++    int y_pred2_x0y0;
+++    int y_pred2_hle16;
+++    int y_pred2_hgt16;
+++} HEVCRpiStats;
+++#endif
+++
+++#endif
+++
+  typedef struct HEVCContext {
+      const AVClass *c;  // needed by private avoptions
+      AVCodecContext *avctx;
+-@@ -798,13 +895,107 @@ typedef struct HEVCContext {
++@@ -798,13 +978,103 @@ typedef struct HEVCContext {
+  
+      HEVCLocalContext    *HEVClcList[MAX_NB_THREADS];
+      HEVCLocalContext    *HEVClc;
+@@ -4676,7 +6123,8 @@ index be91010..6b03ea8 100644
+ +
+ +#ifdef RPI
+ +    int enable_rpi;
+-+    HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS];
+++    HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS];
+++    HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS];
+ +    HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS];
+ +    int buf_width;
+ +    GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS];
+@@ -4685,7 +6133,8 @@ index be91010..6b03ea8 100644
+ +    unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4];
+ +    int num_coeffs[RPI_MAX_JOBS][4];
+ +    int num_xfm_cmds[RPI_MAX_JOBS];
+-+    int num_mv_cmds[RPI_MAX_JOBS];
+++    int num_mv_cmds_y[RPI_MAX_JOBS];
+++    int num_mv_cmds_c[RPI_MAX_JOBS];
+ +    int num_pred_cmds[RPI_MAX_JOBS];
+ +    int num_dblk_cmds[RPI_MAX_JOBS];
+ +    int vpu_id;
+@@ -4695,29 +6144,23 @@ index be91010..6b03ea8 100644
+ +    int max_ctu_count; // Number of CTUs when we trigger a round of processing
+ +    int ctu_per_y_chan; // Number of CTUs per luma QPU
+ +    int ctu_per_uv_chan; // Number of CTUs per chroma QPU
+-+#ifdef RPI_INTER_QPU
+-+    GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[RPI_MAX_JOBS][8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[RPI_MAX_JOBS][8];
+-+    uint32_t *curr_u_mvs; // Current uniform stream to use for chroma
+-+    // Function pointers
+-+    uint32_t mc_filter_uv;
+-+    uint32_t mc_filter_uv_b0;
+-+    uint32_t mc_filter_uv_b;
+++
+++    HEVCRpiJob jobs[RPI_MAX_JOBS];
+++#if RPI_TSTATS
+++    HEVCRpiStats tstats;
+ +#endif
+-+#ifdef RPI_LUMA_QPU
+-+    GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS];
+-+    uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands
+-+    uint32_t *y_mvs_base[RPI_MAX_JOBS][12];
+-+    uint32_t *y_mvs[RPI_MAX_JOBS][12];
+-+    uint32_t *curr_y_mvs; // Current uniform stream for luma
+++#if RPI_INTER
+++    HEVCRpiChromaPred * curr_pred_c;
+++    HEVCRpiLumaPred * curr_pred_y;
+++    struct qpu_mc_pred_y_s * last_y8_p;
+++    struct qpu_mc_pred_y_s * last_y8_lx;
+++
+ +    // Function pointers
+-+    uint32_t mc_filter;
+-+    uint32_t mc_filter_b;
+++    uint32_t qpu_filter_uv;
+++    uint32_t qpu_filter_uv_b0;
+++    uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory
+++    uint32_t qpu_filter;
+++    uint32_t qpu_filter_b;
+ +#endif
+ +
+ +#ifdef RPI_WORKER
+@@ -4754,7 +6197,7 @@ index be91010..6b03ea8 100644
+ +        int (*vpu_cmds_arm)[6]; // r0-r5 for each command
+ +        int vpu_cmds_vc;
+ +
+-+        int cmd_id;
+++        vpu_qpu_wait_h cmd_id;
+ +    } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
+ +
+ +    struct dblk_vpu_q_s * dvq;
+@@ -4767,7 +6210,7 @@ index be91010..6b03ea8 100644
+      uint8_t *cabac_state;
+  
+      /** 1 if the independent slice segment header was successfully parsed */
+-@@ -922,6 +1113,9 @@ typedef struct HEVCContext {
++@@ -922,6 +1192,9 @@ typedef struct HEVCContext {
+      uint32_t max_mastering_luminance;
+      uint32_t min_mastering_luminance;
+  
+@@ -4777,22 +6220,38 @@ index be91010..6b03ea8 100644
+  } HEVCContext;
+  
+  int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
+-@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                   int log2_trafo_size, enum ScanType scan_idx,
+                                   int c_idx);
+  
+-+#ifdef RPI_INTER_QPU
+-+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n);
+++#if RPI_INTER
+++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n);
+ +#endif
+ +
+  void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size);
+  
+  
++@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16];
++ extern const uint8_t ff_hevc_diag_scan8x8_x[64];
++ extern const uint8_t ff_hevc_diag_scan8x8_y[64];
++ 
+++#ifdef RPI
+++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
+++
+++// arm/hevc_misc_neon.S
+++// Neon coeff zap fn
+++#if HAVE_NEON
+++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+++#endif
+++
+++#endif
+++
++ #endif /* AVCODEC_HEVC_H */
+ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 05b2821..e2f1f4e 100644
++index 05b2821..733efde 100644
+ --- a/libavcodec/hevc_cabac.c
+ +++ b/libavcodec/hevc_cabac.c
+-@@ -21,14 +21,72 @@
++@@ -21,14 +21,76 @@
+   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+   */
+  
+@@ -4805,6 +6264,10 @@ index 05b2821..e2f1f4e 100644
+  #include "hevc.h"
+ +#include "cabac_functions.h"
+ +
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
+++
+ +// BY22 is probably faster than simple bypass if the processor has
+ +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+ +// x86 has fast int divide
+@@ -4866,7 +6329,7 @@ index 05b2821..e2f1f4e 100644
+  /**
+   * number of bin by SyntaxElement.
+   */
+-@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
++@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
+      { 28, 36, 43, 49, 54, 58, 61, 63, },
+  };
+  
+@@ -5078,7 +6541,7 @@ index 05b2821..e2f1f4e 100644
+  void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
+  {
+      if (s->ps.pps->entropy_coding_sync_enabled_flag &&
+-@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
++@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
+      return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
+  }
+  
+@@ -5104,7 +6567,7 @@ index 05b2821..e2f1f4e 100644
+  }
+  
+  int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
+-@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
++@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
+      return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
+  }
+  
+@@ -5121,7 +6584,7 @@ index 05b2821..e2f1f4e 100644
+          ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
+          ctx_shift = (log2_size + 1) >> 2;
+      } else {
+-@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
++@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
+      return value;
+  }
+  
+@@ -5147,7 +6610,7 @@ index 05b2821..e2f1f4e 100644
+  {
+      return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
+  }
+-@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
++@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
+      return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
+  }
+  
+@@ -5160,7 +6623,7 @@ index 05b2821..e2f1f4e 100644
+ +
+ +#ifndef coeff_abs_level_remaining_decode_bypass
+ +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
+- {
+++{
+ +    CABACContext * const c = &s->HEVClc->cc;
+ +    uint32_t y;
+ +    unsigned int prefix;
+@@ -5201,7 +6664,7 @@ index 05b2821..e2f1f4e 100644
+ +#endif
+ +
+ +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
+-+{
++ {
+ +    CABACContext * const c = &s->HEVClc->cc;
+      int prefix = 0;
+      int suffix = 0;
+@@ -5347,7 +6810,7 @@ index 05b2821..e2f1f4e 100644
+ +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+ +{
+ +    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+- }
+++}
+ +#endif
+ +
+ +
+@@ -5442,6 +6905,45 @@ index 05b2821..e2f1f4e 100644
+ +    return i;
+ +}
+ +
+++#ifdef RPI
+++static void rpi_add_residual(HEVCContext * const s,
+++    const unsigned int log2_trafo_size, const unsigned int c_idx,
+++    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
+++{   // Adds coeffs to plane c_idx of s->frame, or queues that add as a pred cmd when s->enable_rpi is set
+++    const AVFrame * const frame = s->frame;
+++    unsigned int stride = frame->linesize[c_idx];
+++    unsigned int x = x0 >> s->ps.sps->hshift[c_idx];  // x0/y0 are scaled down by the plane's subsampling shifts
+++    unsigned int y = y0 >> s->ps.sps->vshift[c_idx];
+++    const int is_sliced = rpi_sliced_frame(frame);
+++    uint8_t * dst = !is_sliced ?
+++            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+++        c_idx == 0 ?
+++            rpi_sliced_frame_pos_y(frame, x, y) :
+++            rpi_sliced_frame_pos_c(frame, x, y);
+++
+++//    if (c_idx != 0) {
+++//        return;
+++//    }
+++    if (s->enable_rpi) {  // deferred: append a cmd to the pass0 job's pred cmd list
+++        HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+++        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);  // sliced chroma selects the _U/_V enum value
+++        cmd->size = log2_trafo_size;
+++        cmd->c_idx = c_idx;
+++        cmd->ta.buf = coeffs;
+++        cmd->ta.dst = dst;
+++        cmd->ta.stride = stride;
+++    }
+++    else if (!is_sliced || c_idx == 0) {  // immediate: non-sliced frame (any plane) or luma
+++        s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++    }
+++    else if (c_idx == 1) {  // immediate: sliced frame, U
+++        s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++    }
+++    else {  // immediate: sliced frame, V
+++        s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++    }
++ }
+++#endif
+  
+  void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                                  int log2_trafo_size, enum ScanType scan_idx,
+@@ -5471,17 +6973,20 @@ index 05b2821..e2f1f4e 100644
+ +    const uint8_t *scan_x_cg, *scan_y_cg;
+ +    const xy_off_t * scan_xy_off;
+  
+++#ifndef RPI
+      ptrdiff_t stride = s->frame->linesize[c_idx];
+      int hshift = s->ps.sps->hshift[c_idx];
+      int vshift = s->ps.sps->vshift[c_idx];
+-     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
++-    uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+++    uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
+                                            ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++-    int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
++-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++#endif
+ +#ifdef RPI
+-+    //***** transform_skip_flag decoded later!
+-+    int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4;
+++    int use_vpu;
+ +#endif
+-     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+--    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+++    int16_t *coeffs;
+ +    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
+      int explicit_rdpcm_flag = 0;
+      int explicit_rdpcm_dir_flag;
+@@ -5496,39 +7001,12 @@ index 05b2821..e2f1f4e 100644
+      int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
+                                           lc->tu.intra_pred_mode_c;
+  
++-    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+ +    int prev_sig = 0;
+ +    const int c_idx_nz = (c_idx != 0);
+ +
+ +    int may_hide_sign;
+ +
+-+#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        int n = trafo_size * trafo_size;
+-+        if (use_vpu) {
+-+            // We support size 4 and size 5.
+-+            // Size 4 grows from the front  (Coeffs_buf_arm[2] points to start of buf)
+-+            // Size 5 grows from the back   (Coeffs_buf_arm[3] points to end of buf)
+-+            // num_coeffs is indexed by log2_trafo_size-2
+-+            if (log2_trafo_size == 4)
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2];
+-+            else
+-+                coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n;
+-+            s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n;
+-+        } else {
+-+            coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0];
+-+            s->num_coeffs[s->pass0_job][0] += n;
+-+        }
+-+    }
+-+    // We now do the memset after transform_add while we know the data is cached.
+-+    #ifdef RPI_PRECLEAR
+-+    #else
+-+    memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+    #endif
+-+#else
+-     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
+-+#endif
+-+
+-+
+  
+      // Derive QP for dequant
+      if (!lc->cu.cu_transquant_bypass_flag) {
+@@ -5537,7 +7015,7 @@ index 05b2821..e2f1f4e 100644
+          static const uint8_t rem6[51 + 4 * 6 + 1] = {
+              0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+              3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+-@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+          };
+          int qp_y = lc->qp_y;
+  
+@@ -5558,7 +7036,7 @@ index 05b2821..e2f1f4e 100644
+          }
+  
+          if (c_idx == 0) {
+-@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              qp += s->ps.sps->qp_bd_offset;
+          }
+  
+@@ -5629,6 +7107,9 @@ index 05b2821..e2f1f4e 100644
+ +        may_hide_sign = 0;
+      }
+  
+++
+++
+++
+      if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
+ -        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
+ -        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+@@ -5646,7 +7127,7 @@ index 05b2821..e2f1f4e 100644
+                                             &last_significant_coeff_x, &last_significant_coeff_y);
+  
+      if (last_significant_coeff_x > 3) {
+-@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+          int last_x_c = last_significant_coeff_x & 3;
+          int last_y_c = last_significant_coeff_y & 3;
+  
+@@ -5703,27 +7184,53 @@ index 05b2821..e2f1f4e 100644
+ -    for (i = num_last_subset; i >= 0; i--) {
+ -        int n, m;
+ -        int x_cg, y_cg, x_c, y_c, pos;
+--        int implicit_non_zero_coeff = 0;
+--        int64_t trans_coeff_level;
+--        int prev_sig = 0;
+--        int offset = i << 4;
+--        int rice_init = 0;
+ +    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
+- 
+--        uint8_t significant_coeff_flag_idx[16];
+--        uint8_t nb_significant_coeff_flag = 0;
+--
+--        x_cg = scan_x_cg[i];
+--        y_cg = scan_y_cg[i];
+--
+--        if ((i < num_last_subset) && (i > 0)) {
+--            int ctx_cg = 0;
+--            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
+--                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
+--            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
+--                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++
+ +    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
+++
+++    {
+++        const unsigned int ccount = 1 << (log2_trafo_size * 2);
+++#ifdef RPI
+++        use_vpu = 0;
+++        if (s->enable_rpi) {
+++            use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4;
+++            coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+++#if HAVE_NEON
+++            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+++#else
+++            memset(coeffs, 0, ccount * sizeof(int16_t));
+++#endif
+++        }
+++        else
+++#endif
+++        {
+++            coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++            memset(coeffs, 0, ccount * sizeof(int16_t));
+++        }
+++    }
+++
+++    i = num_last_subset;
+++    do {
++         int implicit_non_zero_coeff = 0;
++-        int64_t trans_coeff_level;
++-        int prev_sig = 0;
++-        int offset = i << 4;
++-        int rice_init = 0;
+++        int n_end;
+  
++         uint8_t significant_coeff_flag_idx[16];
++-        uint8_t nb_significant_coeff_flag = 0;
++-
++-        x_cg = scan_x_cg[i];
++-        y_cg = scan_y_cg[i];
++-
++-        if ((i < num_last_subset) && (i > 0)) {
++-            int ctx_cg = 0;
++-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
++-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
++-
+ -            significant_coeff_group_flag[x_cg][y_cg] =
+ -                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
+ -            implicit_non_zero_coeff = 1;
+@@ -5732,13 +7239,8 @@ index 05b2821..e2f1f4e 100644
+ -            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
+ -             (x_cg == 0 && y_cg == 0));
+ -        }
+-+    i = num_last_subset;
+-+    do {
+-+        int implicit_non_zero_coeff = 0;
+-+        int n_end;
+- 
++-
+ -        last_scan_pos = num_coeff - offset - 1;
+-+        uint8_t significant_coeff_flag_idx[16];
+ +        unsigned int nb_significant_coeff_flag = 0;
+  
+          if (i == num_last_subset) {
+@@ -5824,7 +7326,7 @@ index 05b2821..e2f1f4e 100644
+                          if (log2_trafo_size == 3) {
+                              scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
+                          } else {
+-@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                      }
+                  }
+              }
+@@ -5873,12 +7375,11 @@ index 05b2821..e2f1f4e 100644
+                      significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
+                      nb_significant_coeff_flag++;
+                  }
+-@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              }
+          }
+  
+ -        n_end = nb_significant_coeff_flag;
+--
+ +        if (nb_significant_coeff_flag != 0) {
+ +            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+ +                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+@@ -5926,6 +7427,9 @@ index 05b2821..e2f1f4e 100644
+ +                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+ +                }
+  
+++                // Probably not worth the overhead of starting by22 for just one value
+++                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
++ 
+ -        if (n_end) {
+ -            int first_nz_pos_in_cg;
+ -            int last_nz_pos_in_cg;
+@@ -5936,9 +7440,6 @@ index 05b2821..e2f1f4e 100644
+ -            int sum_abs = 0;
+ -            int sign_hidden;
+ -            int sb_type;
+-+                // Probably not worth the overhead of starting by22 for just one value
+-+                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
+- 
+ +                if (coded_val)
+ +                {
+ +                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+@@ -5949,13 +7450,18 @@ index 05b2821..e2f1f4e 100644
+ +                        const unsigned int c_rice_param = *stat_coeff >> 2;
+ +                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
+  
+--            // initialize first elem of coeff_bas_level_greater1_flag
+--            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+ +                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+ +                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+ +                    }
+ +                }
+  
++-            // initialize first elem of coeff_bas_level_greater1_flag
++-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+++                {
+++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++ 
+ -            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+ -                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
+ -                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
+@@ -5963,11 +7469,7 @@ index 05b2821..e2f1f4e 100644
+ -                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
+ -                c_rice_param = lc->stat_coeff[sb_type] / 4;
+ -            }
+-+                {
+-+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
+-+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+-+                    const unsigned int scale_m = blk_scale[xy_off->scale];
+- 
++-
+ -            if (!(i == num_last_subset) && greater1_ctx == 0)
+ -                ctx_set++;
+ -            greater1_ctx = 1;
+@@ -6052,10 +7554,6 @@ index 05b2821..e2f1f4e 100644
+ +
+ +                            sum_abs += last_coeff_abs_level_remaining + 1;
+ +                            *level = trans_coeff_level;
+-+
+-+                            if (stat_coeff != NULL)
+-+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+-+                            stat_coeff = NULL;
+  
+ -            for (m = 0; m < n_end; m++) {
+ -                n = significant_coeff_flag_idx[m];
+@@ -6076,6 +7574,10 @@ index 05b2821..e2f1f4e 100644
+ -                                if (lc->stat_coeff[sb_type] > 0)
+ -                                    lc->stat_coeff[sb_type]--;
+ -                            rice_init = 1;
+++                            if (stat_coeff != NULL)
+++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+++                            stat_coeff = NULL;
+++
+ +                            if (trans_coeff_level > (3 << c_rice_param) &&
+ +                                (c_rice_param < 4 || rice_adaptation_enabled))
+ +                                ++c_rice_param;
+@@ -6176,7 +7678,7 @@ index 05b2821..e2f1f4e 100644
+  
+      if (lc->cu.cu_transquant_bypass_flag) {
+          if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+-@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+          }
+      } else {
+@@ -6185,7 +7687,7 @@ index 05b2821..e2f1f4e 100644
+              int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
+                        log2_trafo_size == 2 &&
+                        lc->cu.pred_mode == MODE_INTRA;
+-@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                  for (i = 0; i < 8; i++)
+                      FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
+              }
+@@ -6193,7 +7695,7 @@ index 05b2821..e2f1f4e 100644
+              s->hevcdsp.transform_skip(coeffs, log2_trafo_size);
+  
+              if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
+-@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                  s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
+              }
+          } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
+@@ -6221,7 +7723,7 @@ index 05b2821..e2f1f4e 100644
+              int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
+              if (max_xy == 0)
+                  s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs);
+-@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+                      col_limit = FFMIN(24, col_limit);
+                  s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit);
+              }
+@@ -6229,26 +7731,20 @@ index 05b2821..e2f1f4e 100644
+          }
+      }
+      if (lc->tu.cross_pf) {
+-@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
++@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
+              coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
+          }
+      }
+ +#ifdef RPI
+-+    if (s->enable_rpi) {
+-+        HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-+        cmd->type = RPI_PRED_TRANSFORM_ADD;
+-+        cmd->size = log2_trafo_size;
+-+        cmd->buf = coeffs;
+-+        cmd->dst = dst;
+-+        cmd->stride = stride;
+-+        return;
+-+    }
+-+#endif
+++    rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs);
+++#else
+      s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride);
+++#endif
+  }
+  
++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size)
+ diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c
+-index 1f33b0c..55a0315 100644
++index 1f33b0c..3143b4f 100644
+ --- a/libavcodec/hevc_filter.c
+ +++ b/libavcodec/hevc_filter.c
+ @@ -22,6 +22,12 @@
+@@ -6269,14 +7765,78 @@ index 1f33b0c..55a0315 100644
+  #include "bit_depth_template.c"
+  
+ +#ifdef RPI
+-+#include "rpi_user_vcsm.h"
+ +#include "rpi_qpu.h"
+++#include "rpi_zc.h"
+ +#endif
+ +
+  #define LUMA 0
+  #define CB 1
+  #define CR 2
+-@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC)
++     return s->qp_y_tab[x + y * s->ps.sps->min_cb_width];
++ }
++ 
+++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx)
+++{   // Returns the left-shift the callers in this file apply to sample offsets/lengths for plane c_idx
+++#ifdef RPI
+++    return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift;  // chroma of a sliced frame: always 1
+++#else
+++    return s->ps.sps->pixel_shift;
+++#endif
+++}
+++
++ static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height,
++                      intptr_t stride_dst, intptr_t stride_src)
++ {
++@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src,
++                            int stride_src, int x, int y, int width, int height,
++                            int c_idx, int x_ctb, int y_ctb)
++ {
++-    int sh = s->ps.sps->pixel_shift;
+++    const unsigned int sh = pixel_shift(s, c_idx);
++     int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
++     int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
++ 
++@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s,
++         int y_min        = ((y0         ) >> s->ps.sps->log2_min_pu_size);
++         int x_max        = ((x0 + width ) >> s->ps.sps->log2_min_pu_size);
++         int y_max        = ((y0 + height) >> s->ps.sps->log2_min_pu_size);
++-        int len          = (min_pu_size >> hshift) << s->ps.sps->pixel_shift;
+++        const unsigned int sh = pixel_shift(s, c_idx);
+++        int len          = (min_pu_size >> hshift) << sh;
++         for (y = y_min; y < y_max; y++) {
++             for (x = x_min; x < x_max; x++) {
++                 if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) {
++                     int n;
++-                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
++-                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift);
+++                    uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
+++                    const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh);
++                     for (n = 0; n < (min_pu_size >> vshift); n++) {
++                         memcpy(src, dst, len);
++                         src += stride_src;
++@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s,
++ 
++ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++ {
++-    static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 };
+++    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++     HEVCLocalContext *lc = s->HEVClc;
++     int c_idx;
++     int edges[4];  // 0 left 1 top 2 right 3 bottom
++@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     uint8_t right_tile_edge  = 0;
++     uint8_t up_tile_edge     = 0;
++     uint8_t bottom_tile_edge = 0;
+++#ifdef RPI
+++    const int sliced = rpi_sliced_frame(s->frame);
+++    const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1);
+++#else
+++    const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1);
+++#endif
++ 
++     edges[0]   = x_ctb == 0;
++     edges[1]   = y_ctb == 0;
+      edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
+      edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
+  
+@@ -6287,7 +7847,301 @@ index 1f33b0c..55a0315 100644
+      if (restore) {
+          if (!edges[0]) {
+              left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
+-@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         }
++     }
++ 
++-    for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) {
+++    for (c_idx = 0; c_idx < plane_count; c_idx++) {
++         int x0       = x >> s->ps.sps->hshift[c_idx];
++         int y0       = y >> s->ps.sps->vshift[c_idx];
++         int stride_src = s->frame->linesize[c_idx];
++@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> s->ps.sps->hshift[c_idx]) - x0);
++         int height   = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0);
++         int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
++-        uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
++-        int stride_dst;
+++        ptrdiff_t stride_dst;
++         uint8_t *dst;
++ 
+++#ifdef RPI
+++        const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift;
+++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+++        uint8_t * const src = !sliced ?
+++                &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0, y0);
+++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
+++            !sliced ? src - (1 << sh) :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0);
+++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
+++            !sliced ? src + (width << sh) :
+++            c_idx == 0 ?
+++                rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) :
+++                rpi_sliced_frame_pos_c(s->frame, x0 + width, y0);
+++
+++
+++        if (sliced && c_idx > 1) {
+++            break;
+++        }
+++#else
+++        const unsigned int sh = s->ps.sps->pixel_shift;
+++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
+++        uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)];
+++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh);
+++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh);
+++#endif
+++
++         switch (sao->type_idx[c_idx]) {
++         case SAO_BAND:
++             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                            x_ctb, y_ctb);
++             if (s->ps.pps->transquant_bypass_enable_flag ||
++                 (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) {
++-            dst = lc->edge_emu_buffer;
++-            stride_dst = 2*MAX_PB_SIZE;
++-            copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src);
++-            s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
++-                                            width, height);
++-            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++-                               x, y, width, height, c_idx);
+++                dst = lc->edge_emu_buffer;
+++                stride_dst = 2*MAX_PB_SIZE;
+++                copy_CTB(dst, src, width << sh, height, stride_dst, stride_src);
+++#ifdef RPI
+++                if (sliced && c_idx != 0)
+++                {
+++                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
+++                                                    sao->offset_val[1], sao->band_position[1],
+++                                                    sao->offset_val[2], sao->band_position[2],
+++                                                    width, height);
+++                }
+++                else
+++#endif
+++                {
+++                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
+++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
+++                                                    width, height);
+++                }
+++                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
+++                                   x, y, width, height, c_idx);
++             } else {
++-            s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++-                                            sao->offset_val[c_idx], sao->band_position[c_idx],
++-                                            width, height);
+++#ifdef RPI
+++                if (sliced && c_idx != 0)
+++                {
+++                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
+++                                                    sao->offset_val[1], sao->band_position[1],
+++                                                    sao->offset_val[2], sao->band_position[2],
+++                                                    width, height);
+++                }
+++                else
+++#endif
+++                {
+++                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
+++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
+++                                                    width, height);
+++                }
++             }
++             sao->type_idx[c_idx] = SAO_APPLIED;
++             break;
++@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++         {
++             int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx];
++             int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx];
++-            int left_edge = edges[0];
++             int top_edge = edges[1];
++-            int right_edge = edges[2];
++             int bottom_edge = edges[3];
++-            int sh = s->ps.sps->pixel_shift;
++-            int left_pixels, right_pixels;
++ 
++             stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE;
++             dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE;
++ 
++             if (!top_edge) {
++-                int left = 1 - left_edge;
++-                int right = 1 - right_edge;
++-                const uint8_t *src1[2];
++                 uint8_t *dst1;
++-                int src_idx, pos;
+++                int src_idx;
+++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++ 
++-                dst1 = dst - stride_dst - (left << sh);
++-                src1[0] = src - stride_src - (left << sh);
++-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh);
++-                pos = 0;
++-                if (left) {
+++                dst1 = dst - stride_dst;
+++
+++                if (src_l != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1, src1[src_idx], sh);
++-                    pos += (1 << sh);
+++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
++                 }
+++
++                 src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
++                            SAO_APPLIED);
++-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
++-                if (right) {
++-                    pos += width << sh;
+++                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
+++
+++                if (src_r != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
++                 }
++             }
++             if (!bottom_edge) {
++-                int left = 1 - left_edge;
++-                int right = 1 - right_edge;
++-                const uint8_t *src1[2];
++-                uint8_t *dst1;
++-                int src_idx, pos;
+++                uint8_t * const dst1 = dst + height * stride_dst;
+++                int src_idx;
+++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+++                const unsigned int hoff = height * stride_src;
++ 
++-                dst1 = dst + height * stride_dst - (left << sh);
++-                src1[0] = src + height * stride_src - (left << sh);
++-                src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
++-                pos = 0;
++-                if (left) {
+++                if (src_l != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1, src1[src_idx], sh);
++-                    pos += (1 << sh);
+++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
++                 }
+++
++                 src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
++                            SAO_APPLIED);
++-                memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
++-                if (right) {
++-                    pos += width << sh;
+++                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+++
+++                if (src_r != NULL) {
++                     src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
++                                SAO_APPLIED);
++-                    copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
++                 }
++             }
++-            left_pixels = 0;
++-            if (!left_edge) {
+++            if (src_l != NULL) {
++                 if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                     copy_vert(dst - (1 << sh),
++                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
++                               sh, height, stride_dst, 1 << sh);
++                 } else {
++-                    left_pixels = 1;
+++                    copy_vert(dst - (1 << sh),
+++                              src_l,
+++                              sh, height, stride_dst, stride_src);
++                 }
++             }
++-            right_pixels = 0;
++-            if (!right_edge) {
+++            if (src_r != NULL) {
++                 if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                     copy_vert(dst + (width << sh),
++                               s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
++                               sh, height, stride_dst, 1 << sh);
++                 } else {
++-                    right_pixels = 1;
+++                    copy_vert(dst + (width << sh),
+++                              src_r,
+++                              sh, height, stride_dst, stride_src);
++                 }
++             }
++ 
++-            copy_CTB(dst - (left_pixels << sh),
++-                     src - (left_pixels << sh),
++-                     (width + left_pixels + right_pixels) << sh,
+++            copy_CTB(dst,
+++                     src,
+++                     width << sh,
++                      height, stride_dst, stride_src);
++ 
++             copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                            x_ctb, y_ctb);
++-            s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++-                                            sao->eo_class[c_idx], width, height);
++-            s->hevcdsp.sao_edge_restore[restore](src, dst,
++-                                                stride_src, stride_dst,
++-                                                sao,
++-                                                edges, width,
++-                                                height, c_idx,
++-                                                vert_edge,
++-                                                horiz_edge,
++-                                                diag_edge);
+++#ifdef RPI
+++            if (sliced && c_idx != 0)
+++            {
+++                // Class always the same for both U & V (which is just as well :-))
+++                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
+++                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
+++                                                width, height);
+++                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
+++                                                    stride_src, stride_dst,
+++                                                    sao,
+++                                                    edges, width,
+++                                                    height, c_idx,
+++                                                    vert_edge,
+++                                                    horiz_edge,
+++                                                    diag_edge);
+++            }
+++            else
+++#endif
+++            {
+++                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
+++                                                sao->eo_class[c_idx], width, height);
+++                s->hevcdsp.sao_edge_restore[restore](src, dst,
+++                                                    stride_src, stride_dst,
+++                                                    sao,
+++                                                    edges, width,
+++                                                    height, c_idx,
+++                                                    vert_edge,
+++                                                    horiz_edge,
+++                                                    diag_edge);
+++            }
++             restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++                                x, y, width, height, c_idx);
++             sao->type_idx[c_idx] = SAO_APPLIED;
++@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y)
++     }
++ }
++ 
+++// Returns 2 or 0.
++ static int get_pcm(HEVCContext *s, int x, int y)
++ {
++     int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
++@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     uint8_t *src;
++     int x, y;
++     int chroma, beta;
++-    int32_t c_tc[2], tc[2];
+++    int32_t c_tc[4], tc[2];
++     uint8_t no_p[2] = { 0 };
++     uint8_t no_q[2] = { 0 };
++ 
++@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                  s->ps.sps->pcm.loop_filter_disable_flag) ||
+                 s->ps.pps->transquant_bypass_enable_flag;
+  
+@@ -6303,27 +8157,81 @@ index 1f33b0c..55a0315 100644
+      if (x0) {
+          left_tc_offset   = s->deblock[ctb - 1].tc_offset;
+          left_beta_offset = s->deblock[ctb - 1].beta_offset;
+-@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+-                                                          s->frame->linesize[LUMA],
+-                                                          beta, tc, no_p, no_q);
+-                 } else
++@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
++                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
++-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                 if (pcmf) {
++                     no_p[0] = get_pcm(s, x - 1, y);
++                     no_p[1] = get_pcm(s, x - 1, y + 4);
++                     no_q[0] = get_pcm(s, x, y);
++                     no_q[1] = get_pcm(s, x, y + 4);
++-                    s->hevcdsp.hevc_v_loop_filter_luma_c(src,
++-                                                         s->frame->linesize[LUMA],
++-                                                         beta, tc, no_p, no_q);
++-                } else
++-                    s->hevcdsp.hevc_v_loop_filter_luma(src,
++-                                                       s->frame->linesize[LUMA],
++-                                                       beta, tc, no_p, no_q);
+++                }
+++#ifdef RPI
+++                if (rpi_sliced_frame(s->frame)) {
+++
+++                    // This copes properly with no_p/no_q
+++                    s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y),
+++                                                     s->frame->linesize[LUMA],
+++                                                     beta, tc, no_p, no_q,
+++                                                     rpi_sliced_frame_pos_y(s->frame, x - 4, y));
+++                }
+++                else
+++#endif
+++                {
+++                    src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++                    if (pcmf) {
+++                        // Standard DSP code is broken if no_p / no_q is set
+++                        s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+++                                                           s->frame->linesize[LUMA],
+++                                                           beta, tc, no_p, no_q);
+++                    }
+++                    else
+ +#ifdef RPI_DEBLOCK_VPU
+-+                if (s->enable_rpi_deblock) {
+-+                    uint8_t (*setup)[2][2][4];
+-+                    int num16 = (y>>4)*s->setup_width + (x>>4);
+-+                    int a = ((y>>3) & 1) << 1;
+-+                    int b = (x>>3) & 1;
+-+                    setup = s->dvq->y_setup_arm[num16];
+-+                    setup[0][b][0][a] = beta;
+-+                    setup[0][b][0][a + 1] = beta;
+-+                    setup[0][b][1][a] = tc[0];
+-+                    setup[0][b][1][a + 1] = tc[1];
+-+                } else
+++                    if (s->enable_rpi_deblock) {
+++                        uint8_t (*setup)[2][2][4];
+++                        int num16 = (y>>4)*s->setup_width + (x>>4);
+++                        int a = ((y>>3) & 1) << 1;
+++                        int b = (x>>3) & 1;
+++                        setup = s->dvq->y_setup_arm[num16];
+++                        setup[0][b][0][a] = beta;
+++                        setup[0][b][0][a + 1] = beta;
+++                        setup[0][b][1][a] = tc[0];
+++                        setup[0][b][1][a + 1] = tc[1];
+++                    } else
+ +#endif
+-                     s->hevcdsp.hevc_v_loop_filter_luma(src,
+-                                                        s->frame->linesize[LUMA],
+-                                                        beta, tc, no_p, no_q);
+-@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+++                    {
+++                        s->hevcdsp.hevc_v_loop_filter_luma(src,
+++                                                           s->frame->linesize[LUMA],
+++                                                           beta, tc, no_p, no_q);
+++                    }
+++                }
++             }
++         }
++ 
++@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                 beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
++                 tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
++                 tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
++-                src     = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++                src =
+++#ifdef RPI
+++                    rpi_sliced_frame(s->frame) ?
+++                        rpi_sliced_frame_pos_y(s->frame, x, y) :
+++#endif
+++                        &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++                 if (pcmf) {
++                     no_p[0] = get_pcm(s, x, y - 1);
++                     no_p[1] = get_pcm(s, x + 4, y - 1);
++@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                           s->frame->linesize[LUMA],
+                                                           beta, tc, no_p, no_q);
+                  } else
+@@ -6343,7 +8251,113 @@ index 1f33b0c..55a0315 100644
+                      s->hevcdsp.hevc_h_loop_filter_luma(src,
+                                                         s->frame->linesize[LUMA],
+                                                         beta, tc, no_p, no_q);
+-@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     }
++ 
++     if (s->ps.sps->chroma_format_idc) {
+++#ifdef RPI
+++        if (rpi_sliced_frame(s->frame)) {
+++            const int v = 2;
+++            const int h = 2;
+++
+++            // vertical filtering chroma
+++            for (y = y0; y < y_end; y += 8 * v) {
+++                for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
+++                    const int bs0 = s->vertical_bs[(x +  y          * s->bs_width) >> 2];
+++                    const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
+++
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = (get_qPy(s, x - 1, y)         + get_qPy(s, x, y)         + 1) >> 1;
+++                        const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
+++                        unsigned int no_f = 0;
+++
+++                        // tc_offset here should be set to cur_tc_offset I think
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x - 1, y) ? 1 : 0) |
+++                                (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
+++                                (get_pcm(s, x, y) ? 4 : 0) |
+++                                (get_pcm(s, x, y + 4 * v) ? 8 : 0);
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                       s->frame->linesize[1],
+++                                                       tc4,
+++                                                       rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+++                                                       no_f);
+++                    }
+++                }
+++
+++                if (y == 0)
+++                    continue;
+++
+++                // horizontal filtering chroma
+++                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+++                x_end2 = x_end;
+++                if (x_end != s->ps.sps->width)
+++                    x_end2 = x_end - 8 * h;
+++
+++                for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
+++                    const int bs0 = s->horizontal_bs[( x          + y * s->bs_width) >> 2];
+++                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = bs0 == 2 ? (get_qPy(s, x,         y - 1) + get_qPy(s, x,         y) + 1) >> 1 : 0;
+++                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++                        unsigned int no_f = 0;
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x,         y - 1) ? 1 : 0) |
+++                                (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
+++                                (get_pcm(s, x,         y)     ? 4 : 0) |
+++                                (get_pcm(s, x + 4 * h, y)     ? 8 : 0);
+++
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                             s->frame->linesize[1],
+++                                                             tc4, no_f);
+++                    }
+++                }
+++            }
+++        }
+++        else
+++#endif
++         for (chroma = 1; chroma <= 2; chroma++) {
++             int h = 1 << s->ps.sps->hshift[chroma];
++             int v = 1 << s->ps.sps->vshift[chroma];
++@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                         c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
++                         c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
++-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+++                        src =
+++#ifdef RPI
+++                            rpi_sliced_frame(s->frame) ?
+++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
++                         if (pcmf) {
++                             no_p[0] = get_pcm(s, x - 1, y);
++                             no_p[1] = get_pcm(s, x - 1, y + (4 * v));
++@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                     s->frame->linesize[chroma],
+                                                                     c_tc, no_p, no_q);
+                          } else
+@@ -6367,7 +8381,21 @@ index 1f33b0c..55a0315 100644
+                      }
+                  }
+  
+-@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                         c_tc[0]   = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
++                         c_tc[1]   = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
++-                        src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+++                        src =
+++#ifdef RPI
+++                            rpi_sliced_frame(s->frame) ?
+++                                rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                                &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++                         if (pcmf) {
++                             no_p[0] = get_pcm(s, x,           y - 1);
++                             no_p[1] = get_pcm(s, x + (4 * h), y - 1);
++@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+                                                                     s->frame->linesize[chroma],
+                                                                     c_tc, no_p, no_q);
+                          } else
+@@ -6387,7 +8415,7 @@ index 1f33b0c..55a0315 100644
+                              s->hevcdsp.hevc_h_loop_filter_chroma(src,
+                                                                   s->frame->linesize[chroma],
+                                                                   c_tc, no_p, no_q);
+-@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
+      }
+  }
+  
+@@ -6457,7 +8485,7 @@ index 1f33b0c..55a0315 100644
+  
+  void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+                                             int log2_trafo_size)
+-@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+      int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
+      int min_pu_width     = s->ps.sps->min_pu_width;
+      int min_tu_width     = s->ps.sps->min_tb_width;
+@@ -6467,8 +8495,9 @@ index 1f33b0c..55a0315 100644
+ -    int i, j, bs;
+ +    int i, j;
+ +    RefPicList *rpl      = s->ref->refPicList;
+-+    int min_pu_in_4pix   = (1 << log2_min_pu_size) >> 2;
+-+    int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size;
+++    const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
+++    const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2);  // Dup
+++    const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep
+ +    int y_pu             = y0 >> log2_min_pu_size;
+ +    int x_pu             = x0 >> log2_min_pu_size;
+ +    MvField *curr        = &tab_mvf[y_pu * min_pu_width + x_pu];
+@@ -6482,7 +8511,7 @@ index 1f33b0c..55a0315 100644
+  
+      boundary_upper = y0 > 0 && !(y0 & 7);
+      if (boundary_upper &&
+-@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+            (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+          boundary_upper = 0;
+  
+@@ -6559,7 +8588,7 @@ index 1f33b0c..55a0315 100644
+      boundary_left = x0 > 0 && !(x0 & 7);
+      if (boundary_left &&
+          ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
+-@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+            (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
+          boundary_left = 0;
+  
+@@ -6574,9 +8603,7 @@ index 1f33b0c..55a0315 100644
+ -        int xq_pu =  x0      >> log2_min_pu_size;
+ -        int xp_tu = (x0 - 1) >> log2_min_tu_size;
+ -        int xq_tu =  x0      >> log2_min_tu_size;
+-+                               rpl;
+-+        MvField *left = curr - 1;
+- 
++-
+ -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+ -                int y_pu      = (y0 + i) >> log2_min_pu_size;
+ -                int y_tu      = (y0 + i) >> log2_min_tu_size;
+@@ -6594,18 +8621,20 @@ index 1f33b0c..55a0315 100644
+ -                s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
+ -            }
+ -    }
+-+        if (is_intra) {
+-+            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+-+                bs[j * s->bs_width >> 2] = 2;
+- 
++-
+ -    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
+ -        RefPicList *rpl = s->ref->refPicList;
+--
+++                               rpl;
+++        MvField *left = curr - 1;
++ 
+ -        // bs for TU internal horizontal PU boundaries
+ -        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
+ -            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
+ -            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+--
+++        if (is_intra) {
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+++                bs[j * s->bs_width >> 2] = 2;
++ 
+ -            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
+ -                int x_pu = (x0 + i) >> log2_min_pu_size;
+ -                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
+@@ -6662,137 +8691,42 @@ index 1f33b0c..55a0315 100644
+          }
+      }
+  }
+-@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
+  #undef CB
+  #undef CR
+  
+-+#if !defined(RPI_FAST_CACHEFLUSH)
+-+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU)
+-+static void flush_buffer_y(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+
+-+static void flush_buffer_u(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+
+-+static void flush_buffer_v(const AVFrame * const frame) {
+-+    GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame);
+-+    gpu_cache_flush(&p);
+-+}
+-+#endif
+-+#endif
+-+
+-+
+ +#ifdef RPI_DEBLOCK_VPU
+-+#error Not fixed yet
+-+
+ +// ff_hevc_flush_buffer_lines
+ +// flushes and invalidates all pixel rows in [start,end-1]
+ +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = start;
+-+        int n = end;
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        if (flush_chroma) {
+-+          p = get_gpu_mem_ptr_u(s->frame);
+-+          iocache.s[0].handle = p.vcsm_handle;
+-+          iocache.s[0].cmd = 3; // clean+invalidate
+-+          iocache.s[0].addr = (int)p.arm + base;
+-+          iocache.s[0].size  = sz;
+-+          p = get_gpu_mem_ptr_v(s->frame);
+-+          iocache.s[1].handle = p.vcsm_handle;
+-+          iocache.s[1].cmd = 3; // clean+invalidate
+-+          iocache.s[1].addr = (int)p.arm + base;
+-+          iocache.s[1].size  = sz;
+-+        }
+-+        if (flush_luma) {
+-+          p = get_gpu_mem_ptr_y(s->frame);
+-+          sz = s->frame->linesize[0] * (n-curr_y);
+-+          base = s->frame->linesize[0] * curr_y;
+-+          iocache.s[2].handle = p.vcsm_handle;
+-+          iocache.s[2].cmd = 3; // clean+invalidate
+-+          iocache.s[2].addr = (int)p.arm + base;
+-+          iocache.s[2].size  = sz;
+-+        }
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        if (flush_chroma) {
+-+          flush_buffer_u(s->frame);
+-+          flush_buffer_v(s->frame);
+-+        }
+-+        if (flush_luma) {
+-+          flush_buffer_y(s->frame);
+-+        }
+-+#endif
+++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();  // one flush request per call
+++    rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++      start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma);  // end - start: presumably a row count - confirm against the helper
+++    rpi_cache_flush_finish(rfe);  // presumably executes the queued flush and releases rfe - confirm
+ +}
+ +#endif
+ +
+-+#ifdef RPI_INTER_QPU
+-+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n)
+++#if RPI_INTER
+++
+++// Flush some lines of a reference frame
+++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
+ +{
+ +    if (s->enable_rpi && s->used_for_ref) {
+-+      // TODO make this use ff_hevc_flush_buffer_lines
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+        struct vcsm_user_clean_invalid_s iocache = {};
+-+        int curr_y = ((int *)f->progress->data)[0];
+-+        int curr_uv = curr_y >> s->ps.sps->vshift[1];
+-+        int n_uv = n >> s->ps.sps->vshift[1];
+-+        int sz,base;
+-+        GPU_MEM_PTR_T p;
+-+        if (curr_uv < 0) curr_uv = 0;
+-+        if (n_uv<=curr_uv) { return; }
+-+        sz = s->frame->linesize[1] * (n_uv-curr_uv);
+-+        base = s->frame->linesize[1] * curr_uv;
+-+        p = get_gpu_mem_ptr_u(s->frame);
+-+        iocache.s[0].handle = p.vcsm_handle;
+-+        iocache.s[0].cmd = 3; // clean+invalidate
+-+        iocache.s[0].addr = (int)p.arm + base;
+-+        iocache.s[0].size  = sz;
+-+        p = get_gpu_mem_ptr_v(s->frame);
+-+        iocache.s[1].handle = p.vcsm_handle;
+-+        iocache.s[1].cmd = 3; // clean+invalidate
+-+        iocache.s[1].addr = (int)p.arm + base;
+-+        iocache.s[1].size  = sz;
+-+
+-+#ifdef RPI_LUMA_QPU
+-+        p = get_gpu_mem_ptr_y(s->frame);
+-+        sz = s->frame->linesize[0] * (n-curr_y);
+-+        base = s->frame->linesize[0] * curr_y;
+-+        iocache.s[2].handle = p.vcsm_handle;
+-+        iocache.s[2].cmd = 3; // clean+invalidate
+-+        iocache.s[2].addr = (int)p.arm + base;
+-+        iocache.s[2].size  = sz;
+-+#endif
+-+        vcsm_clean_invalid( &iocache );
+-+#else
+-+        flush_buffer_u(s->frame);
+-+        flush_buffer_v(s->frame);
+-+#ifdef RPI_LUMA_QPU
+-+        flush_buffer_y(s->frame);
+-+#endif
+-+
+-+#endif
+-+        //memcpy(s->dummy.arm,s->frame->data[0],2048*64);
+-+        //memcpy(s->dummy.arm,s->frame->data[1],1024*32);
+-+        //memcpy(s->dummy.arm,s->frame->data[2],1024*32);
+++        const int d0 = ((int *)f->progress->data)[0];  // first int of f's progress buffer
+++        const unsigned int curr_y = d0 == -1 ? 0 : d0;  // Progress is -1 before anything has been reported; treat that as line 0
+++        // Nothing to flush once the recorded progress has reached the frame height
+++        if (curr_y < (unsigned int)f->f->height) {
+++            rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++            rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,  // NOTE(review): flushes s->frame but bounds come from f - presumably the same picture, confirm
+++              curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1);  // NOTE(review): unsigned count wraps if n < curr_y - confirm callers only move progress forward
+++            rpi_cache_flush_finish(rfe);
+++        }
+ +    }
+ +}
+ +#endif
+ +
+ +#ifdef RPI_DEBLOCK_VPU
+-+#error XXX
+ +/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+ +static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+ +{
+@@ -6821,16 +8755,19 @@ index 1f33b0c..55a0315 100644
+ +  s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+ +  s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+ +  s->dvq->vpu_cmds_arm[2][5] = 4;
+++
+ +  // Call VPU
+-+  s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands
+++  {
+++      const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+++      vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5);  // 5 means to do all the commands
+++      vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
+++      vpu_qpu_job_finish(vqj);
+++  }
+ +
+ +  s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+ +  s->dvq = s->dvq_ents + s->dvq_n;
+ +
+-+  if (s->dvq->cmd_id != -1) {
+-+      vpu_wait(s->dvq->cmd_id);
+-+      s->dvq->cmd_id = -1;
+-+  }
+++  vpu_qpu_wait(&s->dvq->cmd_id);
+ +}
+ +
+ +#endif
+@@ -6859,14 +8796,14 @@ index 1f33b0c..55a0315 100644
+      if (s->ps.sps->sao_enabled) {
+          int y_end = y >= s->ps.sps->height - ctb_size;
+          if (y && x)
+-@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
+              sao_filter_CTB(s, x - ctb_size, y);
+          if (y && x_end) {
+              sao_filter_CTB(s, x, y - ctb_size);
+ -            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_buffer(s,&s->ref->tf, y);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
+ +#endif
+                  ff_thread_report_progress(&s->ref->tf, y, 0);
+ +            }
+@@ -6874,14 +8811,14 @@ index 1f33b0c..55a0315 100644
+          if (x_end && y_end) {
+              sao_filter_CTB(s, x , y);
+ -            if (s->threads_type & FF_THREAD_FRAME )
+-+            if (s->threads_type & FF_THREAD_FRAME ) {
+-+#ifdef RPI_INTER_QPU
+-+                ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
+ +#endif
+                  ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+ +            }
+ +        }
+-+    } else if (s->threads_type & FF_THREAD_FRAME && x_end) {
+++    } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
+ +        //int newh = y + ctb_size - 4;
+ +        //int currh = s->ref->tf.progress->data[0];
+ +        //if (((y + ctb_size)&63)==0)
+@@ -6892,15 +8829,15 @@ index 1f33b0c..55a0315 100644
+ +            ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+ +          }
+ +        } else {
+-+#ifdef RPI_INTER_QPU
+-+          ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++#if RPI_INTER
+++          rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+ +#endif
+ +          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+          }
+ -    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
+ +#else
+-+#ifdef RPI_INTER_QPU
+-+        ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4);
+++#if RPI_INTER
+++        rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+ +        // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+ +#endif
+          ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+@@ -6910,10 +8847,23 @@ index 1f33b0c..55a0315 100644
+  
+  void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
+ diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
+-index 83f2ec2..6882a8d 100644
++index 83f2ec2..bcf53dc 100644
+ --- a/libavcodec/hevc_ps.c
+ +++ b/libavcodec/hevc_ps.c
+-@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
++     switch (sps->bit_depth) {
++     case 8:
++         if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+++#if RPI_HEVC_SAND
+++        // *** Horrid kludge s.t. we start out with sand format
+++        if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
+++#else
++         if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+++#endif
++         if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
++         if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
++        break;
++@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
+      sps->amp_enabled_flag = get_bits1(gb);
+      sps->sao_enabled      = get_bits1(gb);
+  
+@@ -6923,7 +8873,7 @@ index 83f2ec2..6882a8d 100644
+      if (sps->pcm_enabled_flag) {
+          sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
+ diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
+-index 9d773d9..a6534a9 100644
++index 9d773d9..c4d7250 100644
+ --- a/libavcodec/hevcdsp.c
+ +++ b/libavcodec/hevcdsp.c
+ @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
+@@ -7047,7 +8997,68 @@ index 9d773d9..a6534a9 100644
+  void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
+  {
+  #undef FUNC
+-@@ -257,6 +371,8 @@ int i = 0;
++@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++ 
+++#ifndef RPI  /* non-RPI builds: SLICED_LOOP_FILTERS expands to nothing */
+++#define SLICED_LOOP_FILTERS(depth)
+++#else  /* RPI builds: set the three extra loop-filter pointers on hevcdsp */
+++#define SLICED_LOOP_FILTERS(depth)\
+++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
+++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
+++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
+++#endif  /* RPI */
+++
+++
++ #define HEVC_DSP(depth)                                                     \
++     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
++     hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
++@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
++     hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
++     hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+++    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth);                 \
+++    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
+++    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
+++    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
+++    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
+++    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
+++    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
+++    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
+++    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
++     hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
++     hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
++     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
++@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
++     hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
++                                                                                \
+++    hevcdsp->sao_band_filter_c[0] =                                            \
+++    hevcdsp->sao_band_filter_c[1] =                                            \
+++    hevcdsp->sao_band_filter_c[2] =                                            \
+++    hevcdsp->sao_band_filter_c[3] =                                            \
+++    hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth);            \
+++    hevcdsp->sao_edge_filter_c[0] =                                            \
+++    hevcdsp->sao_edge_filter_c[1] =                                            \
+++    hevcdsp->sao_edge_filter_c[2] =                                            \
+++    hevcdsp->sao_edge_filter_c[3] =                                            \
+++    hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth);            \
+++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);        \
+++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth);        \
+++                                                                               \
++     QPEL_FUNCS(depth);                                                         \
++     QPEL_UNI_FUNCS(depth);                                                     \
++     QPEL_BI_FUNCS(depth);                                                      \
++@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     EPEL_UNI_FUNCS(depth);                                                     \
++     EPEL_BI_FUNCS(depth);                                                      \
++                                                                                \
+++    SLICED_LOOP_FILTERS(depth);                                                \
++     hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
++     hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
++     hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
++@@ -257,6 +404,8 @@ int i = 0;
+          break;
+      }
+  
+@@ -7057,10 +9068,10 @@ index 9d773d9..a6534a9 100644
+          ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
+      if (ARCH_ARM)
+ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
+-index 9f1f6dd..e221e54 100644
++index 9f1f6dd..639ecf1 100644
+ --- a/libavcodec/hevcdsp.h
+ +++ b/libavcodec/hevcdsp.h
+-@@ -42,6 +42,17 @@ typedef struct SAOParams {
++@@ -42,11 +42,26 @@ typedef struct SAOParams {
+      uint8_t type_idx[3];    ///< sao_type_idx
+  } SAOParams;
+  
+@@ -7078,21 +9089,742 @@ index 9f1f6dd..e221e54 100644
+  typedef struct HEVCDSPContext {
+      void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+                      struct GetBitContext *gb, int pcm_bit_depth);
+-@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext {
+++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+++                    struct GetBitContext *gb, int pcm_bit_depth);
++ 
++-    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+++    void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++ 
++     void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
++ 
++@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext {
++ 
++     void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++    void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                               const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                               const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                               int width, int height);
++ 
++     /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++     void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+++    void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+++                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
++ 
++     void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                 struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                 uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++ 
++     void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                     int height, intptr_t mx, intptr_t my, int width);
++@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext {
+      void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
+                                          int32_t *tc, uint8_t *no_p,
+                                          uint8_t *no_q);
+++#ifdef RPI
+++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+++                                 unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++                                 const uint8_t no_p[2], const uint8_t no_q[2],
+++                                 uint8_t * _pix_l);
+++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+++                                 unsigned int no_f);
+++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                                 uint8_t * src_l,
+++                                 unsigned int no_f);
+++
+++#endif
+++
+ +    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+ +                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+ +                                               MvField *curr, MvField *neigh, uint8_t *bs);
+  } HEVCDSPContext;
+  
+  void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
++diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
++index b840d17..32b9e47 100644
++--- a/libavcodec/hevcdsp_template.c
+++++ b/libavcodec/hevcdsp_template.c
++@@ -26,6 +26,9 @@
++ #include "bit_depth_template.c"
++ #include "hevcdsp.h"
++ 
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
++ 
++ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                           GetBitContext *gb, int pcm_bit_depth)
++@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
++     }
++ }
++ 
+++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+++                          GetBitContext *gb, int pcm_bit_depth)
+++{  // Reads two width x height sets of PCM samples from gb into one interleaved destination
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);  // rescale so stride can advance a pixel pointer
+++    // First set: fills the even pixel slots (dst[0], dst[2], ...) of each row
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);  // scale sample up to BIT_DEPTH
+++        dst += stride;
+++    }
+++    // Second set: same layout shifted by one pixel, i.e. the odd slots (presumably U first, then V)
+++    dst = (pixel *)_dst + 1;
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+++        dst += stride;
+++    }
+++}
+++
+++
++ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
++                                                      ptrdiff_t stride, int size)
++ {
++@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
++     }
++ }
++ 
+++static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res,
+++                                                ptrdiff_t stride, int size)
+++{  // Adds size x size residuals to every second pixel of dst, clipping each result
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);  // rescale so stride can advance a pixel pointer
+++    // x advances by 2: only alternate pixels are updated while res is consumed sequentially
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x] = av_clip_pixel(dst[x] + *res);
+++            res++;
+++        }
+++        dst += stride;
+++    }
+++}
+++
++ static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
++                                        ptrdiff_t stride)
++ {
++@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
++     FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
++ }
++ 
+++// -- U -- (plaited)
+++
+++static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{
+++    FUNC(add_residual_uv)(_dst, res, stride, 4);  // 4x4 residual block, starting at _dst itself
+++}
+++
+++static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{
+++    FUNC(add_residual_uv)(_dst, res, stride, 8);  // 8x8 residual block, starting at _dst itself
+++}
+++
+++static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{
+++    FUNC(add_residual_uv)(_dst, res, stride, 16);  // 16x16 residual block, starting at _dst itself
+++}
+++
+++static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{
+++    FUNC(add_residual_uv)(_dst, res, stride, 32);  // 32x32 residual block, starting at _dst itself
+++}
+++
+++// -- V -- (plaited)
+++
+++static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{  // 4x4 residual add for the component stored one pixel after _dst
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 4);  // step one whole pixel, not one byte
+++}
+++
+++static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res,
+++                                  ptrdiff_t stride)
+++{  // 8x8 residual add for the component stored one pixel after _dst
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 8);  // step one whole pixel, not one byte
+++}
+++
+++static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{  // 16x16 residual add for the component stored one pixel after _dst
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 16);  // step one whole pixel, not one byte
+++}
+++
+++static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res,
+++                                    ptrdiff_t stride)
+++{  // 32x32 residual add for the component stored one pixel after _dst
+++    FUNC(add_residual_uv)(_dst + sizeof(pixel), res, stride, 32);  // step one whole pixel, not one byte
+++}
+++
++ 
++ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++ {
++@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++     int x, y;
++     pixel *dst = (pixel *)_dst;
++     pixel *src = (pixel *)_src;
++-    int16_t *sao_offset_val = sao->offset_val[c_idx];
++     int sao_eo_class    = sao->eo_class[c_idx];
++     int init_x = 0, width = _width, height = _height;
++ 
++@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ 
++     if (sao_eo_class != SAO_EO_VERT) {
++         if (borders[0]) {
++-            int offset_val = sao_offset_val[0];
++             for (y = 0; y < height; y++) {
++-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+++                dst[y * stride_dst] = src[y * stride_src];
++             }
++             init_x = 1;
++         }
++         if (borders[2]) {
++-            int offset_val = sao_offset_val[0];
++             int offset     = width - 1;
++             for (x = 0; x < height; x++) {
++-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++             }
++             width--;
++         }
++     }
++     if (sao_eo_class != SAO_EO_HORIZ) {
++         if (borders[1]) {
++-            int offset_val = sao_offset_val[0];
++             for (x = init_x; x < width; x++)
++-                dst[x] = av_clip_pixel(src[x] + offset_val);
+++                dst[x] = src[x];
++         }
++         if (borders[3]) {
++-            int offset_val   = sao_offset_val[0];
++-            int y_stride_dst = stride_dst * (height - 1);
++-            int y_stride_src = stride_src * (height - 1);
+++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++             for (x = init_x; x < width; x++)
++-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+++                dst[x + y_stride_dst] = src[x + y_stride_src];
++             height--;
++         }
++     }
++@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++     int x, y;
++     pixel *dst = (pixel *)_dst;
++     pixel *src = (pixel *)_src;
++-    int16_t *sao_offset_val = sao->offset_val[c_idx];
++     int sao_eo_class    = sao->eo_class[c_idx];
++     int init_x = 0, init_y = 0, width = _width, height = _height;
++ 
++@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ 
++     if (sao_eo_class != SAO_EO_VERT) {
++         if (borders[0]) {
++-            int offset_val = sao_offset_val[0];
++             for (y = 0; y < height; y++) {
++-                dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val);
+++                dst[y * stride_dst] = src[y * stride_src];
++             }
++             init_x = 1;
++         }
++         if (borders[2]) {
++-            int offset_val = sao_offset_val[0];
++             int offset     = width - 1;
++             for (x = 0; x < height; x++) {
++-                dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val);
+++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++             }
++             width--;
++         }
++     }
++     if (sao_eo_class != SAO_EO_HORIZ) {
++         if (borders[1]) {
++-            int offset_val = sao_offset_val[0];
++             for (x = init_x; x < width; x++)
++-                dst[x] = av_clip_pixel(src[x] + offset_val);
+++                dst[x] = src[x];
++             init_y = 1;
++         }
++         if (borders[3]) {
++-            int offset_val   = sao_offset_val[0];
++-            int y_stride_dst = stride_dst * (height - 1);
++-            int y_stride_src = stride_src * (height - 1);
+++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
+++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++             for (x = init_x; x < width; x++)
++-                dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val);
+++                dst[x + y_stride_dst] = src[x + y_stride_src];
++             height--;
++         }
++     }
++@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++     }
++ }
++ 
+++
+++// --- Plaited chroma versions: U and V samples are interleaved in one plane
+++// (U at even pixel indices, V at odd). Only BIT_DEPTH == 8 is implemented.
+++#if BIT_DEPTH != 8
+++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); // no implementation at this depth
+++    abort();
+++}
+++#else
+++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
+++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                                  int width, int height)
+++{
+++    pixel *dst = (pixel *)_dst;
+++    const pixel *src = (const pixel *)_src; // read-only: keep the caller's const
+++    int offset_table_u[32] = { 0 };
+++    int offset_table_v[32] = { 0 };
+++    int k, y, x;
+++    int shift  = BIT_DEPTH - 5; // the top 5 bits of a sample index the 32-entry tables
+++    // Strides are divided down from bytes to pixels; width is doubled to count U and V.
+++    stride_dst /= sizeof(pixel);
+++    stride_src /= sizeof(pixel);
+++    width *= 2;
+++    // Only 4 consecutive bands (wrapping modulo 32) starting at the left class get an offset.
+++    for (k = 0; k < 4; k++)
+++    {
+++        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+++        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+++    }
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x += 2)
+++        {
+++            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
+++            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
+++        }
+++        dst += stride_dst;
+++        src += stride_src;
+++    }
+++}
+++#endif
+++
+++#if BIT_DEPTH != 8
+++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+++                                  int eo, int width, int height) {
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); // no implementation at this depth
+++    abort();
+++}
+++#else
+++// Edge-offset SAO on interleaved chroma: even pixels use the U offsets, odd pixels the V offsets.
+++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
+++                                  int eo, int width, int height) {
+++    // pos[eo] holds the {dx, dy} steps to the two neighbours compared for edge class eo.
+++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+++    static const int8_t pos[4][2][2] = {
+++        { { -1,  0 }, {  1, 0 } }, // horizontal
+++        { {  0, -1 }, {  0, 1 } }, // vertical
+++        { { -1, -1 }, {  1, 1 } }, // 45 degree
+++        { {  1, -1 }, { -1, 1 } }, // 135 degree
+++    };
+++    pixel *dst = (pixel *)_dst;
+++    const pixel *src = (const pixel *)_src; // read-only: keep the caller's const
+++    int a_stride, b_stride;
+++    int x, y;
+++    ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+++    stride_dst /= sizeof(pixel);
+++    width *= 2;
+++    // dx is doubled because the same-component neighbour is two pixels away when interleaved.
+++    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
+++    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x += 2) {
+++            int diff0u = CMP(src[x], src[x + a_stride]);
+++            int diff1u = CMP(src[x], src[x + b_stride]);
+++            int offset_valu        = edge_idx[2 + diff0u + diff1u];
+++            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
+++            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
+++            int offset_valv        = edge_idx[2 + diff0v + diff1v];
+++            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
+++            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
+++        }
+++        src += stride_src;
+++        dst += stride_dst;
+++    }
+++}
+++#endif
+++
+++#if BIT_DEPTH != 8 // interleaved-chroma restore has no implementation at these depths
+++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src,
+++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
+++                                    int *borders, int _width, int _height,
+++                                    int c_idx, uint8_t *vert_edge,
+++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
+++{
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);
+++    abort();
+++}
+++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src,
+++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
+++                                    int *borders, int _width, int _height,
+++                                    int c_idx, uint8_t *vert_edge,
+++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
+++{
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);
+++    abort();
+++}
+++#else
+++// Any old 2 byte 'normal' restore will work for these, so alias the 10-bit ones
+++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10
+++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10
+++#endif
+++
+++
++ #undef CMP
++ 
++ ////////////////////////////////////////////////////////////////////////////////
++@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ #undef TQ1
++ #undef TQ2
++ #undef TQ3
+++
+++#ifdef RPI
+++
+++// line zero
+++#define P3 pix_l[0 * xstride]
+++#define P2 pix_l[1 * xstride]
+++#define P1 pix_l[2 * xstride]
+++#define P0 pix_l[3 * xstride]
+++#define Q0 pix_r[0 * xstride]
+++#define Q1 pix_r[1 * xstride]
+++#define Q2 pix_r[2 * xstride]
+++#define Q3 pix_r[3 * xstride]
+++
+++// line three. used only for deblocking decision
+++#define TP3 pix_l[0 * xstride + 3 * ystride]
+++#define TP2 pix_l[1 * xstride + 3 * ystride]
+++#define TP1 pix_l[2 * xstride + 3 * ystride]
+++#define TP0 pix_l[3 * xstride + 3 * ystride]
+++#define TQ0 pix_r[0 * xstride + 3 * ystride]
+++#define TQ1 pix_r[1 * xstride + 3 * ystride]
+++#define TQ2 pix_r[2 * xstride + 3 * ystride]
+++#define TQ3 pix_r[3 * xstride + 3 * ystride]
+++
+++// This is identical to hevc_loop_filter_luma except that the P/Q
+++// components are on separate pointers (_pix_l supplies P, _pix_r supplies Q)
+++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+++                                 unsigned int _stride, unsigned int beta, const int32_t _tc[2],
+++                                 const uint8_t _no_p[2], const uint8_t _no_q[2],
+++                                 uint8_t * _pix_l)
+++{
+++    int d, j;
+++    pixel *pix_l        = (pixel *)_pix_l;
+++    pixel *pix_r        = (pixel *)_pix_r;
+++    const ptrdiff_t xstride = 1; // P3..P0 and Q0..Q3 are adjacent samples within a row
+++    const ptrdiff_t ystride = _stride / sizeof(pixel); // _stride is divided down from bytes to pixels
+++
+++    beta <<= BIT_DEPTH - 8;
+++
+++    for (j = 0; j < 2; j++) { // two groups of 4 rows, each with its own tc / no_p / no_q
+++        const int dp0  = abs(P2  - 2 * P1  + P0);
+++        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
+++        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
+++        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
+++        const int d0   = dp0 + dq0;
+++        const int d3   = dp3 + dq3;
+++        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
+++        const int no_p = _no_p[j];
+++        const int no_q = _no_q[j];
+++
+++        if (d0 + d3 >= beta) { // decision from rows 0 and 3: skip this group of 4 rows unfiltered
+++            pix_l += 4 * ystride;
+++            pix_r += 4 * ystride;
+++            continue;
+++        } else {
+++            const int beta_3 = beta >> 3;
+++            const int beta_2 = beta >> 2;
+++            const int tc25   = ((tc * 5 + 1) >> 1);
+++
+++            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
+++                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
+++                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
+++                // strong filtering
+++                const int tc2 = tc << 1;
+++                for (d = 0; d < 4; d++) {
+++                    const int p3 = P3;
+++                    const int p2 = P2;
+++                    const int p1 = P1;
+++                    const int p0 = P0;
+++                    const int q0 = Q0;
+++                    const int q1 = Q1;
+++                    const int q2 = Q2;
+++                    const int q3 = Q3;
+++                    if (!no_p) { // a non-zero no_p leaves the P side untouched
+++                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
+++                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
+++                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
+++                    }
+++                    if (!no_q) { // a non-zero no_q leaves the Q side untouched
+++                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
+++                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
+++                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
+++                    }
+++                    pix_l += ystride;
+++                    pix_r += ystride;
+++                }
+++            } else { // normal filtering
+++                int nd_p = 1;
+++                int nd_q = 1;
+++                const int tc_2 = tc >> 1;
+++                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
+++                    nd_p = 2; // also modify P1
+++                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
+++                    nd_q = 2; // also modify Q1
+++
+++                for (d = 0; d < 4; d++) {
+++                    const int p2 = P2;
+++                    const int p1 = P1;
+++                    const int p0 = P0;
+++                    const int q0 = Q0;
+++                    const int q1 = Q1;
+++                    const int q2 = Q2;
+++                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
+++                    if (abs(delta0) < 10 * tc) {
+++                        delta0 = av_clip(delta0, -tc, tc);
+++                        if (!no_p)
+++                            P0 = av_clip_pixel(p0 + delta0);
+++                        if (!no_q)
+++                            Q0 = av_clip_pixel(q0 - delta0);
+++                        if (!no_p && nd_p > 1) {
+++                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
+++                            P1 = av_clip_pixel(p1 + deltap1);
+++                        }
+++                        if (!no_q && nd_q > 1) {
+++                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
+++                            Q1 = av_clip_pixel(q1 + deltaq1);
+++                        }
+++                    }
+++                    pix_l += ystride;
+++                    pix_r += ystride;
+++                }
+++            }
+++        }
+++    }
+++}
+++
+++#undef TP3
+++#undef TP2
+++#undef TP1
+++#undef TP0
+++#undef TQ0
+++#undef TQ1
+++#undef TQ2
+++#undef TQ3
+++
+++#undef P3
+++#undef P2
+++#undef P1
+++#undef P0
+++#undef Q0
+++#undef Q1
+++#undef Q2
+++#undef Q3
+++
+++#define P1 pix_l[0 * xstride]
+++#define P0 pix_l[1 * xstride]
+++#define Q0 pix_r[0 * xstride]
+++#define Q1 pix_r[1 * xstride]
+++// Chroma deblock with P read via _pix_l and Q via _pix_r; only P0 and Q0 are ever written.
+++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
+++                                          ptrdiff_t _ystride, const int32_t *_tc,
+++                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
+++{
+++    int d, j, no_p, no_q;
+++    pixel *pix_l        = (pixel *)_pix_l;
+++    pixel *pix_r        = (pixel *)_pix_r;
+++    ptrdiff_t xstride = _xstride / sizeof(pixel); // both strides are divided down from bytes to pixels
+++    ptrdiff_t ystride = _ystride / sizeof(pixel);
+++
+++    for (j = 0; j < 2; j++) { // two groups of 4 lines, each with its own tc / no_p / no_q
+++        const int tc = _tc[j] << (BIT_DEPTH - 8);
+++        if (tc <= 0) { // nothing to filter in this group: step over its 4 lines
+++            pix_l += 4 * ystride;
+++            pix_r += 4 * ystride;
+++            continue;
+++        }
+++        no_p = _no_p[j];
+++        no_q = _no_q[j];
+++
+++        for (d = 0; d < 4; d++) {
+++            int delta0;
+++            const int p1 = P1;
+++            const int p0 = P0;
+++            const int q0 = Q0;
+++            const int q1 = Q1;
+++            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
+++            if (!no_p) // a non-zero no_p leaves the P side untouched
+++                P0 = av_clip_pixel(p0 + delta0);
+++            if (!no_q) // a non-zero no_q leaves the Q side untouched
+++                Q0 = av_clip_pixel(q0 - delta0);
+++            pix_l += ystride;
+++            pix_r += ystride;
+++        }
+++    }
+++}
+++// Unpacks tc4 / no_f and runs the chroma loop filter once per interleaved component.
+++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+++                                 unsigned int no_f)
+++{
+++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0-1 of no_f; any non-zero value means "skip P"
+++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2-3 of no_f; any non-zero value means "skip Q"
+++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, low byte first
+++    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first component: tc[0..1]
+++    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // second component, one pixel on: tc[2..3]
+++}
+++// As hevc_h_loop_filter_uv, but calls hevc_loop_filter_uv2 with separate src_l (P) and src_r (Q) pointers.
+++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                                 uint8_t * src_l,
+++                                 unsigned int no_f)
+++{
+++    uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0-1 of no_f; any non-zero value means "skip P"
+++    uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2-3 of no_f; any non-zero value means "skip Q"
+++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // four 8-bit tc values, low byte first
+++    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first component: tc[0..1]
+++    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // second component: tc[2..3]
+++}
+++
+++#undef P1
+++#undef P0
+++#undef Q0
+++#undef Q1
+++
+++
+++#endif
+++
++diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c
++index 02c1766..cea16ea 100644
++--- a/libavcodec/hevcpred.c
+++++ b/libavcodec/hevcpred.c
++@@ -24,6 +24,7 @@
++ 
++ #include "hevcpred.h"
++ 
+++#define PRED_C 0
++ #define BIT_DEPTH 8
++ #include "hevcpred_template.c"
++ #undef BIT_DEPTH
++@@ -39,13 +40,37 @@
++ #define BIT_DEPTH 12
++ #include "hevcpred_template.c"
++ #undef BIT_DEPTH
+++#undef PRED_C
+++
+++#ifdef RPI
+++#define PRED_C 1
+++#define BIT_DEPTH 8
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 9
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 10
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++
+++#define BIT_DEPTH 12
+++#include "hevcpred_template.c"
+++#undef BIT_DEPTH
+++#undef PRED_C
+++#endif
++ 
++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
++ {
++ #undef FUNC
++ #define FUNC(a, depth) a ## _ ## depth
++ 
++-#define HEVC_PRED(depth)                                \
+++#undef FUNCC
+++#define FUNCC(a, depth) a ## _ ## depth ## _c
+++
+++#define HEVC_PRED_Y(depth)                                \
++     hpc->intra_pred[0]   = FUNC(intra_pred_2, depth);   \
++     hpc->intra_pred[1]   = FUNC(intra_pred_3, depth);   \
++     hpc->intra_pred[2]   = FUNC(intra_pred_4, depth);   \
++@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
++     hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++     hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
++ 
+++#define HEVC_PRED_C(depth)                                \
+++    hpc->intra_pred_c[0]   = FUNCC(intra_pred_2, depth);   \
+++    hpc->intra_pred_c[1]   = FUNCC(intra_pred_3, depth);   \
+++    hpc->intra_pred_c[2]   = FUNCC(intra_pred_4, depth);   \
+++    hpc->intra_pred_c[3]   = FUNCC(intra_pred_5, depth);   \
+++    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
+++    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
+++    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
+++    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
+++    hpc->pred_dc_c         = FUNCC(pred_dc, depth);        \
+++    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
+++    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
+++    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
+++    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+++
+++#ifdef RPI
+++#define HEVC_PRED(depth) \
+++    HEVC_PRED_Y(depth); \
+++    HEVC_PRED_C(depth);
+++#else
+++#define HEVC_PRED(depth) \
+++    HEVC_PRED_Y(depth);
+++#endif
+++
++     switch (bit_depth) {
++     case 9:
++         HEVC_PRED(9);
++diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
++index eb17663..00ba3f9 100644
++--- a/libavcodec/hevcpred.h
+++++ b/libavcodec/hevcpred.h
++@@ -38,6 +38,17 @@ typedef struct HEVCPredContext {
++     void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++                             const uint8_t *left, ptrdiff_t stride,
++                             int c_idx, int mode);
+++#ifdef RPI
+++    void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
+++
+++    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
+++                           const uint8_t *left, ptrdiff_t stride);
+++    void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
+++                    ptrdiff_t stride, int log2_size, int c_idx);
+++    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
+++                            const uint8_t *left, ptrdiff_t stride,
+++                            int c_idx, int mode);
+++#endif
++ } HEVCPredContext;
++ 
++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
+ diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
+-index 6ae87cc..28d2653 100644
++index 6ae87cc..c14dddd 100644
+ --- a/libavcodec/hevcpred_template.c
+ +++ b/libavcodec/hevcpred_template.c
+-@@ -20,6 +20,8 @@
++@@ -20,13 +20,55 @@
+   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+   */
+  
+@@ -7101,7 +9833,54 @@ index 6ae87cc..28d2653 100644
+  #include "libavutil/pixdesc.h"
+  
+  #include "bit_depth_template.c"
+-@@ -69,8 +71,11 @@ do {                                  \
++ #include "hevcpred.h"
++ 
+++#ifdef RPI
+++#include "rpi_zc.h"
+++#endif
+++
+++#define DUMP_PRED 0
+++
++ #define POS(x, y) src[(x) + stride * (y)]
++ 
+++#if PRED_C
+++
+++typedef uint8_t (* c8_dst_ptr_t)[2];
+++typedef const uint8_t (* c8_src_ptr_t)[2];
+++
+++#if BIT_DEPTH == 8
+++#undef BIT_DEPTH
+++#define BIT_DEPTH 16
+++#include "bit_depth_template.c"
+++#undef FUNC
+++#define FUNC(a) FUNC3(a, 8, _c)
+++#else
+++#undef FUNC
+++#define FUNC FUNCC
+++#endif
+++
+++#endif
+++
+++#if DUMP_PRED
+++#ifndef DEBUG_ONCE
+++#define DEBUG_ONCE
+++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+++{   // Debug aid (only built when DUMP_PRED is non-zero): prints a size x size block to stdout.
+++    for (unsigned int y = 0; y != size; y++, data += stride * 2) { // each row is stride * 2 bytes apart
+++        for (unsigned int x = 0; x != size; x++) {
+++            printf("%4d", data[x * 2]); // every second byte, i.e. one component of each 2-byte pair
+++        }
+++        printf("\n");
+++    }
+++    printf("\n");
+++}
+++#endif
+++#endif
+++
++ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
++                                               int log2_size, int c_idx)
++ {
++@@ -69,8 +111,11 @@ do {                                  \
+                  AV_WN4P(&ptr[i], a);                                           \
+              else                                                               \
+                  a = PIXEL_SPLAT_X4(ptr[i + 3])
+@@ -7114,17 +9893,399 @@ index 6ae87cc..28d2653 100644
+      int i;
+      int hshift = s->ps.sps->hshift[c_idx];
+      int vshift = s->ps.sps->vshift[c_idx];
+-@@ -114,6 +119,10 @@ do {                                  \
++@@ -79,15 +124,23 @@ do {                                  \
++     int size_in_tbs_h  = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
++     int size_in_luma_v = size << vshift;
++     int size_in_tbs_v  = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
++-    int x = x0 >> hshift;
++-    int y = y0 >> vshift;
+++    const int x = x0 >> hshift;
+++    const int y = y0 >> vshift;
++     int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++     int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++ 
++     int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
++ 
++-    ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+++    const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+++#if defined(RPI)
+++    pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ?
+++            (pixel*)s->frame->data[c_idx] + x + y * stride :
+++        c_idx == 0 ?
+++            (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) :
+++            (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y);
+++#else
++     pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride;
+++#endif
++ 
++     int min_pu_width = s->ps.sps->min_pu_width;
++ 
++@@ -95,14 +148,20 @@ do {                                  \
++                               lc->tu.intra_pred_mode;
++     pixel4 a;
++     pixel  left_array[2 * MAX_TB_SIZE + 1];
+++#if !PRED_C
++     pixel  filtered_left_array[2 * MAX_TB_SIZE + 1];
+++#endif
++     pixel  top_array[2 * MAX_TB_SIZE + 1];
+++#if !PRED_C
++     pixel  filtered_top_array[2 * MAX_TB_SIZE + 1];
+++#endif
++ 
++     pixel  *left          = left_array + 1;
++     pixel  *top           = top_array  + 1;
+++#if !PRED_C
++     pixel  *filtered_left = filtered_left_array + 1;
++     pixel  *filtered_top  = filtered_top_array  + 1;
+++#endif
++     int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
++     int cand_left        = lc->na.cand_left;
++     int cand_up_left     = lc->na.cand_up_left;
++@@ -114,6 +173,26 @@ do {                                  \
+      int top_right_size   = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
+                             (x0 + size_in_luma_h)) >> hshift;
+  
+++    pixel * src_l = src - 1;
+++    pixel * src_u = src - stride;
+++    pixel * src_ur = src_u + size;
+++
+ +#ifdef DISABLE_INTRA
+ +    return;
+ +#endif
+ +
+++#if defined(RPI)
+++    if (s->frame->format == AV_PIX_FMT_SAND128) {
+++        const AVFrame * const frame = s->frame;
+++        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
+++        const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride;
+++        if ((x & mask) == 0)
+++            src_l -= stripe_adj;
+++        if (((x + size) & mask) == 0)
+++            src_ur += stripe_adj;
+++    }
+++#endif
+++
+      if (s->ps.pps->constrained_intra_pred_flag == 1) {
+          int size_in_luma_pu_v = PU(size_in_luma_v);
+          int size_in_luma_pu_h = PU(size_in_luma_h);
++@@ -163,23 +242,24 @@ do {                                  \
++         top[-1] = 128;
++     }
++     if (cand_up_left) {
++-        left[-1] = POS(-1, -1);
+++        left[-1] = src_l[-stride];
++         top[-1]  = left[-1];
++     }
++     if (cand_up)
++-        memcpy(top, src - stride, size * sizeof(pixel));
+++        // Always good - even with sand
+++        memcpy(top, src_u, size * sizeof(pixel));
++     if (cand_up_right) {
++-        memcpy(top + size, src - stride + size, size * sizeof(pixel));
++-        EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1),
+++        memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
+++        EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
++                size - top_right_size);
++     }
++     if (cand_left)
++         for (i = 0; i < size; i++)
++-            left[i] = POS(-1, i);
+++            left[i] = src_l[stride * i];
++     if (cand_bottom_left) {
++         for (i = size; i < size + bottom_left_size; i++)
++-            left[i] = POS(-1, i);
++-        EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1),
+++            left[i] = src_l[stride * i];
+++        EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
++                size - bottom_left_size);
++     }
++ 
++@@ -268,7 +348,11 @@ do {                                  \
++             cand_up_left = 1;
++             cand_left    = 1;
++         } else { // No samples available
+++#if PRED_C && BIT_DEPTH == 16
+++            left[-1] = 0x8080;
+++#else
++             left[-1] = (1 << (BIT_DEPTH - 1));
+++#endif
++             EXTEND(top,  left[-1], 2 * size);
++             EXTEND(left, left[-1], 2 * size);
++         }
++@@ -287,6 +371,9 @@ do {                                  \
++     top[-1] = left[-1];
++ 
++     // Filtering process
+++    // Sand128 can only apply to chroma_format_idc == 1 so we don't need to
+++    // worry about chroma smoothing for that case
+++#if !PRED_C
++     if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0  || s->ps.sps->chroma_format_idc == 3)) {
++         if (mode != INTRA_DC && size != 4){
++             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
++@@ -342,13 +429,46 @@ do {                                  \
++                                            mode);
++         break;
++     }
+++#else
+++    switch (mode) {
+++    case INTRA_PLANAR:
+++        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+++                                          (uint8_t *)left, stride);
+++        break;
+++    case INTRA_DC:
+++        s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
+++                       (uint8_t *)left, stride, log2_size, c_idx);
+++        break;
+++    default:
+++        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
+++                                           (uint8_t *)left, stride, c_idx,
+++                                           mode);
+++        break;
+++    }
+++
+++#if DUMP_PRED
+++    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
+++    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
+++    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
+++    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
+++#endif
+++#endif
++ }
++ 
+++#if !PRED_C || BIT_DEPTH == 16
++ #define INTRA_PRED(size)                                                            \
++ static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
++ {                                                                                   \
++     FUNC(intra_pred)(s, x0, y0, size, c_idx);                                       \
++ }
+++#else /* PRED_C with BIT_DEPTH != 16: entry points exist but only log "NIF" and abort */
+++#define INTRA_PRED(size)                                                            \
+++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx)    \
+++{                                                                                   \
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                              \
+++    abort();                                                                        \
+++}
+++#endif /* !PRED_C || BIT_DEPTH == 16 */
++ 
++ INTRA_PRED(2)
++ INTRA_PRED(3)
++@@ -357,6 +477,7 @@ INTRA_PRED(5)
++ 
++ #undef INTRA_PRED
++ 
+++#if !PRED_C
++ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++                                   const uint8_t *_left, ptrdiff_t stride,
++                                   int trafo_size)
++@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to
++             POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size]  +
++                          (size - 1 - y) * top[x]  + (y + 1) * left[size] + size) >> (trafo_size + 1);
++ }
+++#else /* PRED_C: planar prediction on 2-byte pairs, each byte of a pair predicted independently */
+++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
+++                                  const uint8_t * _left, ptrdiff_t stride,
+++                                  int trafo_size)
+++{
+++    int x, y;
+++    int size = 1 << trafo_size;
+++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src; // pointer to uint8_t[2] pairs
+++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
+++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++
+++    for (y = 0; y < size; y++, src += stride) // src steps in pairs, so stride counts pairs per row
+++    {
+++        for (x = 0; x < size; x++)
+++        {
+++            src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0]  +
+++                         (size - 1 - y) * top[x][0]  + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
+++            src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1]  +
+++                         (size - 1 - y) * top[x][1]  + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
+++        }
+++    }
+++}
+++#endif /* !PRED_C */
++ 
+++#if !PRED_C || BIT_DEPTH == 16
++ #define PRED_PLANAR(size)\
++ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
++                                        const uint8_t *left, ptrdiff_t stride)   \
++ {                                                                               \
++     FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
++ }
+++#else /* PRED_C with BIT_DEPTH != 16: entry points exist but only log "NIF" and abort */
+++#define PRED_PLANAR(size)\
+++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
+++                                       const uint8_t *left, ptrdiff_t stride)   \
+++{                                                                               \
+++    av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__);                          \
+++    abort();                                                                    \
+++}
+++#endif /* !PRED_C || BIT_DEPTH == 16 */
++ 
++ PRED_PLANAR(0)
++ PRED_PLANAR(1)
++@@ -386,6 +540,7 @@ PRED_PLANAR(3)
++ 
++ #undef PRED_PLANAR
++ 
+++#if !PRED_C
++ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                           const uint8_t *_left,
++                           ptrdiff_t stride, int log2_size, int c_idx)
++@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++             POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
++     }
++ }
+++#else /* PRED_C: DC prediction on 2-byte pairs, one DC value per byte of the pair */
+++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
+++                          const uint8_t *_left,
+++                          ptrdiff_t stride, int log2_size, int c_idx)
+++{
+++    unsigned int i, j;
+++    const unsigned int size = (1 << log2_size);
+++    c8_dst_ptr_t src = (c8_dst_ptr_t)_src; // pointer to uint8_t[2] pairs
+++    const c8_src_ptr_t top = (c8_src_ptr_t)_top;
+++    const c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++    unsigned int dc0 = size; // start at size: rounding term for the later >> (log2_size + 1)
+++    unsigned int dc1 = size;
+++
+++    for (i = 0; i < size; i++)
+++    {
+++        dc0 += left[i][0] + top[i][0];
+++        dc1 += left[i][1] + top[i][1];
+++    }
+++
+++    dc0 >>= log2_size + 1; // mean of the 2 * size neighbouring samples
+++    dc1 >>= log2_size + 1;
+++
+++    for (i = 0; i < size; i++, src += stride) // flat fill; c_idx is not used by this variant
+++    {
+++        for (j = 0; j < size; ++j)
+++        {
+++            src[j][0] = dc0;
+++            src[j][1] = dc1;
++ 
+++        }
+++    }
+++}
+++#endif /* !PRED_C */
+++
+++#ifndef ANGLE_CONSTS // guard: the two tables below are defined only if ANGLE_CONSTS is not already defined
+++#define ANGLE_CONSTS
+++static const int intra_pred_angle[] = { // read by pred_angular as intra_pred_angle[mode - 2]
+++     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
+++    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
+++};
+++static const int inv_angle[] = { // read by pred_angular as inv_angle[mode - 11], only on the negative-angle path
+++    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
+++    -630, -910, -1638, -4096
+++};
+++#endif // ANGLE_CONSTS
+++
+++#if !PRED_C
++ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++                                                 const uint8_t *_top,
++                                                 const uint8_t *_left,
++@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++     const pixel *top  = (const pixel *)_top;
++     const pixel *left = (const pixel *)_left;
++ 
++-    static const int intra_pred_angle[] = {
++-         32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++-        -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
++-    };
++-    static const int inv_angle[] = {
++-        -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++-        -630, -910, -1638, -4096
++-    };
++-
++     int angle = intra_pred_angle[mode - 2];
++     pixel ref_array[3 * MAX_TB_SIZE + 4];
++     pixel *ref_tmp = ref_array + size;
++@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++         }
++     }
++ }
+++#else
+++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, // PRED_C variant: angular prediction over 2-component elements
+++                                                const uint8_t *_top,
+++                                                const uint8_t *_left,
+++                                                ptrdiff_t stride, int c_idx, // c_idx is not referenced in this variant
+++                                                int mode, int size)
+++{
+++    int x, y;
+++    c8_dst_ptr_t src  = (c8_dst_ptr_t)_src;
+++    c8_src_ptr_t top  = (c8_src_ptr_t)_top;
+++    c8_src_ptr_t left = (c8_src_ptr_t)_left;
+++
+++    const int angle = intra_pred_angle[mode - 2];
+++    uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; // scratch space for an extended reference line
+++    c8_dst_ptr_t ref_tmp = ref_array + size; // offset by size so that negative indices down to last stay inside ref_array
+++    c8_src_ptr_t ref;
+++    const int last = (size * angle) >> 5; // value of idx for the final row/column; negative when angle < 0
+++
+++    if (mode >= 18) { // this branch reads its reference from top, the else branch from left
+++        ref = top - 1;
+++        if (angle < 0 && last < -1) { // reference indices below -1 are needed: build them in ref_tmp
+++            memcpy(ref_tmp, top - 1, (size + 1) * 2); // 2 bytes per element
+++            for (x = last; x <= -1; x++)
+++            {
+++                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; // fill negative positions from left via the inv_angle table
+++                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+++            }
+++            ref = (c8_src_ptr_t)ref_tmp;
+++        }
+++
+++        for (y = 0; y < size; y++, src += stride) { // one output row per iteration
+++            const int idx  = ((y + 1) * angle) >> 5; // whole-element offset into ref
+++            const int fact = ((y + 1) * angle) & 31; // fractional offset in 1/32 steps
+++            if (fact) {
+++                for (x = 0; x < size; ++x) { // weighted blend of two adjacent reference elements, +16 for rounding
+++                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
+++                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
+++                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
+++                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
+++                }
+++            } else {
+++                memcpy(src, ref + idx + 1, size * 2); // no fractional part: copy the row directly
+++            }
+++        }
+++    } else {
+++        ref = left - 1;
+++        if (angle < 0 && last < -1) { // same extension as above with the roles of top and left swapped
+++            memcpy(ref_tmp, left - 1, (size + 1) * 2);
+++            for (x = last; x <= -1; x++)
+++            {
+++                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
+++                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
+++            }
+++            ref = (c8_src_ptr_t)ref_tmp;
+++        }
+++
+++        for (x = 0; x < size; x++, src++) { // one output column per iteration
+++            const int idx  = ((x + 1) * angle) >> 5;
+++            const int fact = ((x + 1) * angle) & 31;
+++            if (fact) {
+++                for (y = 0; y < size; y++) {
+++                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
+++                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
+++                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
+++                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
+++                }
+++            } else {
+++                for (y = 0; y < size; y++) // no fractional part: copy element by element down the column
+++                {
+++                    src[y * stride][0] = ref[y + idx + 1][0];
+++                    src[y * stride][1] = ref[y + idx + 1][1];
+++                }
+++            }
+++        }
+++    }
+++}
+++#endif
++ 
++ static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++                                  const uint8_t *left,
+ diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+ index 099a8c5..bdff2d2 100644
+ --- a/libavcodec/mmaldec.c
+@@ -7169,6 +10330,87 @@ index 3adf28d..2f9195f 100644
+      if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
+          s->codec_id == AV_CODEC_ID_MPEG4 &&
+          avctx->idct_algo == FF_IDCT_AUTO) {
++diff --git a/libavcodec/raw.c b/libavcodec/raw.c
++index bfa2537..1bca89e 100644
++--- a/libavcodec/raw.c
+++++ b/libavcodec/raw.c
++@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
++     { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
++     { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
++ 
+++    /* RPI */
+++#ifdef RPI
+++    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
+++#endif
+++
++     /* special */
++     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
++     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
++diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
++index d837056..81256b5 100644
++--- a/libavcodec/rawenc.c
+++++ b/libavcodec/rawenc.c
++@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS
++     return 0;
++ }
++ 
+++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) // copies every second byte of data[1], starting at byte c_off, into dst; returns the advanced dst
+++{
+++    for (int y = 0; y != frame->height / 2; ++y) { // height / 2 rows are read from data[1]
+++        for (int x = 0; x < frame->width; x += frame->linesize[0]) { // NOTE(review): linesize[0] looks like the stripe width and linesize[3] the per-stripe stride - confirm against the SAND128 layout
+++            const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off;
+++            const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; // number of bytes taken from this segment (one per 2-byte pair)
+++            for (int i = 0; i < w; ++i)
+++                *dst++ = p[i * 2]; // the callers in this file pass c_off 0 and 1 to select each byte of the pair
+++        }
+++    }
+++    return dst;
+++}
+++
+++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, // writes the frame into pkt as luma followed by the two chroma components
+++                      const AVFrame *frame)
+++{
+++    int size = frame->width * frame->height * 3 / 2; // width * height luma bytes plus two chroma planes; NOTE(review): assumes 8-bit 4:2:0 output - confirm
+++    uint8_t * dst;
+++    int ret;
+++
+++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) // on failure the negative return code is passed straight back
+++        return ret;
+++
+++    dst = pkt->data;
+++
+++    // Luma: each row is copied in segments of at most linesize[0] bytes
+++    for (int y = 0; y != frame->height; ++y) {
+++        for (int x = 0; x < frame->width; x += frame->linesize[0]) {
+++            const int w = FFMIN(frame->linesize[0], frame->width - x); // the last segment of a row may be shorter
+++            memcpy(dst,
+++                frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w);
+++            dst += w;
+++        }
+++    }
+++    // Chroma: two passes over data[1], one per byte offset within each pair
+++    dst = cpy_sand_c(dst, frame, 0);
+++    dst = cpy_sand_c(dst, frame, 1);
+++
+++    return 0;
+++}
+++
++ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
++                       const AVFrame *frame, int *got_packet)
++ {
++@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
++     if (ret < 0)
++         return ret;
++ 
+++    if (frame->format == AV_PIX_FMT_SAND128) {
+++        ret = raw_sand_as_yuv420(avctx, pkt, frame);
+++        *got_packet = (ret == 0);
+++        return ret;
+++    }
+++
++     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
++         return ret;
++     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+ diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h
+ new file mode 100644
+ index 0000000..4309f1c
+@@ -11170,10 +14412,10 @@ index 0000000..5543093
+ +  pop r6-r7, pc
+ diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+ new file mode 100644
+-index 0000000..3904efc
++index 0000000..0255f5d
+ --- /dev/null
+ +++ b/libavcodec/rpi_mailbox.c
+-@@ -0,0 +1,340 @@
++@@ -0,0 +1,149 @@
+ +/*
+ +Copyright (c) 2012, Broadcom Europe Ltd.
+ +All rights reserved.
+@@ -11201,6 +14443,8 @@ index 0000000..3904efc
+ +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +*/
+ +
+++#ifdef RPI
+++
+ +#include <stdio.h>
+ +#include <string.h>
+ +#include <stdlib.h>
+@@ -11208,7 +14452,6 @@ index 0000000..3904efc
+ +#include <unistd.h>
+ +#include <assert.h>
+ +#include <stdint.h>
+-+#include <sys/mman.h>
+ +#include <sys/ioctl.h>
+ +
+ +#include <linux/ioctl.h>
+@@ -11218,137 +14461,29 @@ index 0000000..3904efc
+ +#define DEVICE_FILE_NAME "/dev/vcio"
+ +
+ +#include "rpi_mailbox.h"
+++//#include <interface/vctypes/vc_image_structs.h>
+ +
+-+#define PAGE_SIZE (4*1024)
+-+
+-+// Shared memory will not be cached in ARM cache
+-+void *mapmem_shared(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_SHARED/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+-+
+-+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing
+-+void *mapmem_private(unsigned base, unsigned size)
+-+{
+-+   int mem_fd;
+-+   unsigned offset = base % PAGE_SIZE;
+-+   base = base - offset;
+-+   /* open /dev/mem */
+-+   if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) {
+-+      printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n");
+-+      return NULL;
+-+   }
+-+   void *mem = mmap(
+-+      0,
+-+      size,
+-+      PROT_READ|PROT_WRITE,
+-+      MAP_PRIVATE/*|MAP_FIXED*/,
+-+      mem_fd,
+-+      base);
+-+#ifdef DEBUG
+-+   printf("base=0x%x, mem=%p\n", base, mem);
+-+#endif
+-+   if (mem == MAP_FAILED) {
+-+      printf("mmap error %d\n", (int)mem);
+-+      return NULL;
+-+   }
+-+   close(mem_fd);
+-+   return (char *)mem + offset;
+-+}
+-+
+-+void unmapmem(void *addr, unsigned size)
+-+{
+-+   int s = munmap(addr, size);
+-+   if (s != 0) {
+-+      printf("munmap error %d\n", s);
+-+      exit (-1);
+-+   }
+-+}
+-+
+-+/*
+-+ * use ioctl to send mbox property message
+-+ */
+-+
+-+static int mbox_property(int file_desc, void *buf)
+-+{
+-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+-+
+-+   if (ret_val < 0) {
+-+      printf("ioctl_set_msg failed:%d\n", ret_val);
+-+   }
+-+
+-+#ifdef DEBUG
+-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+-+   for (i=0; i<size/4; i++)
+-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+-+#endif
+-+   return ret_val;
+-+}
+-+
+-+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x3000c; // (the tag id)
+-+   p[i++] = 12; // (size of the buffer)
+-+   p[i++] = 12; // (size of the data)
+-+   p[i++] = size; // (num bytes? or pages?)
+-+   p[i++] = align; // (alignment)
+-+   p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING)
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+unsigned mem_free(int file_desc, unsigned handle)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+++/*
+++ * use ioctl to send mbox property message
+++ */
+ +
+-+   p[i++] = 0x3000f; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = handle;
+++static int mbox_property(int file_desc, void *buf)
+++{
+++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++   if (ret_val < 0) {
+++      printf("ioctl_set_msg failed:%d\n", ret_val);
+++   }
+ +
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+++#ifdef DEBUG
+++   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
+++   for (i=0; i<size/4; i++)
+++      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
+++#endif
+++   return ret_val;
+ +}
+ +
+-+unsigned mem_lock(int file_desc, unsigned handle)
+++unsigned mbox_mem_lock(int file_desc, unsigned handle)
+ +{
+ +   int i=0;
+ +   unsigned p[32];
+@@ -11367,7 +14502,7 @@ index 0000000..3904efc
+ +   return p[5];
+ +}
+ +
+-+unsigned mem_unlock(int file_desc, unsigned handle)
+++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
+ +{
+ +   int i=0;
+ +   unsigned p[32];
+@@ -11386,117 +14521,30 @@ index 0000000..3904efc
+ +   return p[5];
+ +}
+ +
+-+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5)
+-+{
+-+   int i=0;
+-+   unsigned p[32];
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x30010; // (the tag id)
+-+   p[i++] = 28; // (size of the buffer)
+-+   p[i++] = 28; // (size of the data)
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+++#define GET_VCIMAGE_PARAMS 0x30044
+ +
+-+unsigned qpu_enable(int file_desc, unsigned enable)
+++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
+ +{
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+
+-+   p[i++] = 0x30012; // (the tag id)
+-+   p[i++] = 4; // (size of the buffer)
+-+   p[i++] = 4; // (size of the data)
+-+   p[i++] = enable;
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) {
+-+   int i=0;
+-+   unsigned p[32];
+-+
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30011; // (the tag id)
+-+   p[i++] = 16; // (size of the buffer)
+-+   p[i++] = 16; // (size of the data)
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+-+
+-+   mbox_property(file_desc, p);
+-+   return p[5];
+-+}
+-+
+-+void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2) {
+-+   int i=0;
+-+   unsigned p[32];
+++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
+++    uint32_t * p = buf;
+++    void * rimg;
+++    int rv;
+ +
+-+   p[i++] = 0; // size
+-+   p[i++] = 0x00000000; // process request
+-+   p[i++] = 0x30018; // (the tag id)
+-+   p[i++] = 88; // (size of the buffer)
+-+   p[i++] = 88; // (size of the data)
+-+
+-+   p[i++] = num_qpus;
+-+   p[i++] = control;
+-+   p[i++] = noflush;
+-+   p[i++] = timeout; // ms
+-+
+-+   p[i++] = num_qpus_2;
+-+   p[i++] = control_2;
+-+   p[i++] = noflush_2;
+-+   p[i++] = timeout_2; // ms
+-+
+-+   p[i++] = code;
+-+   p[i++] = r0;
+-+   p[i++] = r1;
+-+   p[i++] = r2;
+-+   p[i++] = r3;
+-+   p[i++] = r4;
+-+   p[i++] = r5;
+-+
+-+   p[i++] = code_2;
+-+   p[i++] = r0_2;
+-+   p[i++] = r1_2;
+-+   p[i++] = r2_2;
+-+   p[i++] = r3_2;
+-+   p[i++] = r4_2;
+-+   p[i++] = r5_2;
+++    *p++ = 0; // size
+++    *p++ = 0; // process request
+++    *p++ = GET_VCIMAGE_PARAMS;
+++    *p++ = sizeof(*img);
+++    *p++ = sizeof(*img);
+++    rimg = p;
+++    memcpy(p, img, sizeof(*img));
+++    p += sizeof(*img) / sizeof(*p);
+++    *p++ = 0;  // End tag
+++    buf[0] = (p - buf) * sizeof(*p);
+ +
+-+   p[i++] = 0x00000000; // end tag
+-+   p[0] = i*sizeof *p; // actual size
+++    rv = mbox_property(fd, buf);
+++    memcpy(img, rimg, sizeof(*img));
+ +
+-+   mbox_property(file_desc, p);
+-+   return;
+++    return rv;
+ +}
+ +
+ +int mbox_open() {
+@@ -11514,55 +14562,80 @@ index 0000000..3904efc
+ +void mbox_close(int file_desc) {
+ +  close(file_desc);
+ +}
+++
+++#endif
+++
+ diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+ new file mode 100644
+-index 0000000..5898102
++index 0000000..b316878
+ --- /dev/null
+ +++ b/libavcodec/rpi_mailbox.h
+-@@ -0,0 +1,25 @@
++@@ -0,0 +1,58 @@
+ +#ifndef RPI_MAILBOX_H
+ +#define RPI_MAILBOX_H
+ +
+++/* The image structure. */
+++typedef struct vc_image_extra_uv_s {
+++  void *u, *v;
+++  int vpitch;
+++} VC_IMAGE_EXTRA_UV_T;
+++
+++typedef union {
+++    VC_IMAGE_EXTRA_UV_T uv;
+++//  VC_IMAGE_EXTRA_RGBA_T rgba;
+++//  VC_IMAGE_EXTRA_PAL_T pal;
+++//  VC_IMAGE_EXTRA_TF_T tf;
+++//  VC_IMAGE_EXTRA_BAYER_T bayer;
+++//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
+++//  VC_IMAGE_EXTRA_CODEC_T codec;
+++//  VC_IMAGE_EXTRA_OPENGL_T opengl;
+++} VC_IMAGE_EXTRA_T;
+++
+++
+++typedef struct VC_IMAGE_T {
+++  unsigned short                  type;           /* should restrict to 16 bits */
+++  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
+++  unsigned short                  width;          /* width in pixels */
+++  unsigned short                  height;         /* height in pixels */
+++  int                             pitch;          /* pitch of image_data array in bytes */
+++  int                             size;           /* number of bytes available in image_data array */
+++  void                           *image_data;     /* pixel data */
+++  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
+++  void                           *metadata;       /* metadata header for the image */
+++  void                           *pool_object;    /* nonNULL if image was allocated from a vc_pool */
+++  int                             mem_handle;     /* the mem handle for relocatable memory storage */
+++  int                             metadata_size;  /* size of metadata of each channel in bytes */
+++  int                             channel_offset; /* offset of consecutive channels in bytes */
+++  uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
+++  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
+++  uint8_t                         current_channel;/* the channel this header is currently pointing to */
+++  uint8_t                         linked_multichann_flag;/* Indicates the header has the linked-multichannel structure */
+++  uint8_t                         is_channel_linked;     /* Tracks whether the above structure has been used to link the header
+++                                                            into a linked-multichannel image */
+++  uint8_t                         channel_index;         /* index of the channel this header represents while
+++                                                            it is being linked. */
+++  uint8_t                         _dummy[3];      /* pad struct to 64 bytes */
+++} VC_IMAGE_T;
+++
+++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
+++
+++
+ +extern int mbox_open(void);
+ +extern void mbox_close(int file_desc);
+ +
+-+extern unsigned get_version(int file_desc);
+-+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags);
+-+extern unsigned mem_free(int file_desc, unsigned handle);
+-+extern unsigned mem_lock(int file_desc, unsigned handle);
+-+extern unsigned mem_unlock(int file_desc, unsigned handle);
+-+extern void *mapmem_shared(unsigned base, unsigned size);
+-+extern void *mapmem_private(unsigned base, unsigned size);
+-+extern void unmapmem(void *addr, unsigned size);
+-+
+-+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout);
+-+extern void execute_multi(int file_desc,
+-+   unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout,
+-+   unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2,
+-+   unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+   unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2);
+-+extern unsigned qpu_enable(int file_desc, unsigned enable);
+++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
+++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+++
+++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+ new file mode 100644
+-index 0000000..365f4a6
++index 0000000..7c0eedd
+ --- /dev/null
+ +++ b/libavcodec/rpi_qpu.c
+-@@ -0,0 +1,993 @@
++@@ -0,0 +1,902 @@
+ +#ifdef RPI
+-+// Use vchiq service for submitting jobs
+-+#define GPUSERVICE
+-+
+-+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device.
+-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code
+-+//#define RPI_TIME_TOTAL_QPU
+-+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code
+-+//#define RPI_TIME_TOTAL_VPU
+-+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined
+-+#define RPI_TIME_TOTAL_POSTED
+-+
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +#include <string.h>
+@@ -11575,27 +14648,35 @@ index 0000000..365f4a6
+ +#include <pthread.h>
+ +#include <time.h>
+ +
+++#include <interface/vcsm/user-vcsm.h>
+++
+ +#include "rpi_mailbox.h"
+ +#include "rpi_qpu.h"
+ +#include "rpi_shader.h"
+ +#include "rpi_hevc_transform.h"
+++#include "rpi_zc.h"
+ +
+-+#include "rpi_user_vcsm.h"
+-+#ifdef GPUSERVICE
+ +#pragma GCC diagnostic push
+ +// Many many redundant decls in the header files
+ +#pragma GCC diagnostic ignored "-Wredundant-decls"
+ +#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+ +#pragma GCC diagnostic pop
+-+#endif
+ +
+-+// QPU profile flags
+-+#define NO_FLUSH 1
+-+#define CLEAR_PROFILE 2
+-+#define OUTPUT_COUNTS 4
+++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
+++#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
+++
+++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
+++// Beware this is expensive and will probably throw off all other timing by >10%
+++#define RPI_TRACE_QPU_PROFILE_ALL       0
+ +
+-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
+++// QPU "noflush" flags
+++// a mixture of flushing & profiling
+ +
+++#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
+++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
+++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
+++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
+++#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
+ +
+ +// On Pi2 there is no way to access the VPU L2 cache
+ +// GPU_MEM_FLG should be 4 for uncached memory.  (Or C for alias to allocate in the VPU L2 cache)
+@@ -11652,165 +14733,155 @@ index 0000000..365f4a6
+ +{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
+ +};
+ +
+++// Code/constants on GPU
+ +struct GPU
+ +{
+ +  unsigned int qpu_code[QPU_CODE_SIZE];
+ +  unsigned int vpu_code[VPU_CODE_SIZE];
+ +  short transMatrix2even[16*16*2];
+-+  int open_count; // Number of allocated video buffers
+-+  int      mb; // Mailbox handle
+-+  int      vc; // Address in GPU memory
+-+  int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
+-+  int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+ +};
+ +
+-+// Stop more than one thread trying to allocate memory or use the processing resources at once
+-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+-+static volatile struct GPU* gpu = NULL;
+-+static GPU_MEM_PTR_T gpu_mem_ptr;
+-+
+-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
+-+static unsigned int Microseconds(void) {
+-+    struct timespec ts;
+-+    unsigned int x;
+-+    static unsigned int base = 0;
+-+    clock_gettime(CLOCK_REALTIME, &ts);
+-+    x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
+-+    if (base==0) base=x;
+-+    return x-base;
+-+}
+-+#endif
+++#define CFE_ENTS_PER_A 8
+++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
+++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
+++// allow 128
+++#define CFE_ENT_COUNT  128
+++#define CFE_A_COUNT    (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+ +
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
+-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
+++struct rpi_cache_flush_env_s {
+++    unsigned int n;
+++    struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
+++};
+ +
+-+// Connect to QPU, returns 0 on success.
+-+static int gpu_init(volatile struct GPU **gpu) {
+-+  int mb = mbox_open();
+-+  int vc;
+-+  volatile struct GPU* ptr;
+-+	if (mb < 0)
+-+		return -1;
+-+#ifndef RPI_ASYNC
+-+	if (qpu_enable(mb, 1)) return -2;
+-+#endif
+-+  vcsm_init();
+-+  vc_gpuserv_init();
+-+  gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
+-+  ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
+-+  memset((void*)ptr, 0, sizeof *ptr);
+-+  vc = gpu_mem_ptr.vc;
+++#define WAIT_COUNT_MAX 16
+ +
+-+  ptr->mb = mb;
+-+  ptr->vc = vc;
+++typedef struct trace_time_one_s
+++{
+++  int count;
+++  int64_t start[WAIT_COUNT_MAX];
+++  int64_t total[WAIT_COUNT_MAX];
+++} trace_time_one_t;
+ +
+-+  printf("GPU allocated at 0x%x\n",vc);
+++typedef struct trace_time_wait_s
+++{
+++  unsigned int jcount;
+++  int64_t start0;
+++  int64_t last_update;
+++  trace_time_one_t active;
+++  trace_time_one_t wait;
+++} trace_time_wait_t;
+++
+++typedef struct vq_wait_s
+++{
+++  sem_t sem;
+++  unsigned int cost;
+++  struct vq_wait_s * next;
+++} vq_wait_t;
+ +
+-+  *gpu = ptr;
+++#define VQ_WAIT_POOL_SIZE 16
+++typedef struct vq_wait_pool_s
+++{
+++  vq_wait_t * head;
+++  vq_wait_t pool[VQ_WAIT_POOL_SIZE];
+++} vq_wait_pool_t;
+ +
+-+  // Now copy over the QPU code into GPU memory
+-+  {
+-+    int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
+-+    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+-+  }
+-+  // And the VPU code
+-+  {
+-+    int num_bytes = sizeof(rpi_hevc_transform);
+-+    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+-+    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+-+  }
+-+  // And the transform coefficients
+-+  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
+++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+ +
+-+#ifdef RPI_ASYNC
+-+  {
+-+    int err;
+-+    vpu_async_tail = 0;
+-+    vpu_async_head = 0;
+-+    err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
+-+    //printf("Created thread\n");
+-+    if (err) {
+-+        av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
+-+        return -4;
+-+    }
+++typedef struct gpu_env_s
+++{
+++  int open_count;
+++  int init_count;
+++  int mb;
+++  unsigned int current_load;
+++  GPU_MEM_PTR_T code_gm_ptr;
+++  vq_wait_pool_t wait_pool;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  trace_time_wait_t ttw;
+++#endif
+++} gpu_env_t;
+ +
+-+    {
+-+      struct sched_param param = {0};
+-+      int policy = 0;
+++// Stop more than one thread trying to allocate memory or use the processing resources at once
+++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
+++static gpu_env_t * gpu = NULL;
+ +
+-+      if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+      {
+-+        av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+      }
+-+      else
+-+      {
+-+        av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ +
+-+        policy = SCHED_FIFO;
+-+        param.sched_priority = sched_get_priority_max(SCHED_FIFO);
+++static int64_t ns_time(void) // returns the CLOCK_MONOTONIC reading converted to nanoseconds
+++{
+++    struct timespec ts;
+++    clock_gettime(CLOCK_MONOTONIC, &ts); // return value is not checked
+++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; // the cast widens before multiplying
+++}
+ +
+-+        av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
+-+            policy,
+-+            policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+            param.sched_priority);
+ +
+-+        if (pthread_setschedparam(vpu_thread, policy, &param) != 0)
+-+        {
+-+          av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
+-+        }
+-+        else
+-+        {
+-+          if (pthread_getschedparam(vpu_thread, &policy, &param) != 0)
+-+          {
+-+            av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
+-+          }
+-+          else
+-+          {
+-+            av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
+-+                policy,
+-+                policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
+-+                param.sched_priority);
+-+          }
+-+        }
+-+      }
+++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+ +
+-+    }
+++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
+++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
+++#define T_ARG(t) T_SEC(t), T_MS(t)
+++#define T_FMT "%u.%03u"
+ +
+++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
+++{
+++  // Update totals for levels that are still pending
+++  for (int i = 0; i < tto->count; ++i) {
+++    tto->total[i] += now - tto->start[i];
+++    tto->start[i] = now;
+ +  }
+-+#endif
+ +
+-+  return 0;
+++  printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
+++         prefix,
+++         T_ARG(now - start0 - tto->total[0]),
+++         T_ARG(tto->total[0]),
+++         T_ARG(tto->total[1]),
+++         T_ARG(tto->total[2]),
+++         T_ARG(tto->total[3]));
+ +}
+ +
+-+// Returns 1 if the gpu is currently idle
+-+static int gpu_idle(void)
+++
+++static void tto_start(trace_time_one_t * const tto, const int64_t now)
+ +{
+-+  int ret = pthread_mutex_trylock(&gpu_mutex);
+-+  if (ret==0) {
+-+    pthread_mutex_unlock(&gpu_mutex);
+-+    return 1;
+-+  }
+-+  return 0;
+++  av_assert0(tto->count < WAIT_COUNT_MAX);
+++  tto->start[tto->count++] = now;
+ +}
+ +
+-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+-+static void gpu_lock(void) {
+-+  pthread_mutex_lock(&gpu_mutex);
+++static void tto_end(trace_time_one_t * const tto, const int64_t now)
+++{
+++  const int n = --tto->count;
+++  av_assert0(n >= 0);
+++  tto->total[n] += now - tto->start[n];
+++}
+ +
+-+  if (gpu==NULL) {
+-+    gpu_init(&gpu);
+-+  }
+++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
+++{
+++  printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
+++  tto_print(&ttw->active, now, ttw->start0, "Active");
+++  tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
+ +}
+ +
+-+static void gpu_unlock(void) {
+-+  pthread_mutex_unlock(&gpu_mutex);
+++#endif
+++
+++// GPU memory alloc fns (internal)
+++
+++// GPU_MEM_PTR_T alloc fns
+++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+++  p->numbytes = numbytes;
+++  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+++  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+++  av_assert0(p->vcsm_handle);
+++  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+++  av_assert0(p->vc_handle);
+++  p->arm = vcsm_lock(p->vcsm_handle);
+++  av_assert0(p->arm);
+++  p->vc = mbox_mem_lock(mb, p->vc_handle);
+++  av_assert0(p->vc);
+++  return 0;
+ +}
+ +
+-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
+++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
+ +  p->numbytes = numbytes;
+ +  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+ +  av_assert0(p->vcsm_handle);
+@@ -11818,90 +14889,143 @@ index 0000000..365f4a6
+ +  av_assert0(p->vc_handle);
+ +  p->arm = vcsm_lock(p->vcsm_handle);
+ +  av_assert0(p->arm);
+-+  p->vc = mem_lock(mb, p->vc_handle);
+++  p->vc = mbox_mem_lock(mb, p->vc_handle);
+ +  av_assert0(p->vc);
+ +  return 0;
+ +}
+ +
+-+// Allocate memory on GPU
+-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+-+// Returns 0 on success.
+-+// This allocates memory that will not be cached in ARM's data cache.
+-+// Therefore safe to use without data cache flushing.
+-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+-+{
+-+  int r;
+-+  gpu_lock();
+-+  r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
+-+  gpu->open_count++;
+-+  gpu_unlock();
+-+  return r;
+++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
+++  mbox_mem_unlock(mb, p->vc_handle);
+++  vcsm_unlock_ptr(p->arm);
+++  vcsm_free(p->vcsm_handle);
+++  memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
+ +}
+ +
+-+int gpu_get_mailbox(void)
+++
+++// GPU init, free, lock, unlock
+++
+++static void gpu_term(void)
+ +{
+-+  av_assert0(gpu);
+-+  return gpu->mb;
+++  gpu_env_t * const ge = gpu;
+++
+++  // We have to hope that everything has terminated...
+++  gpu = NULL;
+++
+++  vc_gpuserv_deinit();
+++
+++  gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+++
+++  vcsm_exit();
+++
+++  mbox_close(ge->mb);
+++
+++  vq_wait_pool_deinit(&ge->wait_pool);
+++
+++  free(ge);
+ +}
+ +
+-+// Call this to clean and invalidate a region of memory
+-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
+++
+++// Connect to QPU, returns 0 on success.
+++static int gpu_init(gpu_env_t ** const gpu) {
+++  volatile struct GPU* ptr;
+++  gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
+++  *gpu = NULL;
+++
+++  if (ge == NULL)
+++    return -1;
+++
+++  if ((ge->mb = mbox_open()) < 0)
+++    return -1;
+++
+++  vq_wait_pool_init(&ge->wait_pool);
+++
+++  vcsm_init();
+++
+++  gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
+++  ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+++
+++  // Zero everything so we have zeros between the code bits
+++  memset((void *)ptr, 0, sizeof(*ptr));
+++
+++  // Now copy over the QPU code into GPU memory
+++  {
+++    int num_bytes = (char *)mc_end - (char *)rpi_shader;
+++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+++  }
+++  // And the VPU code
+++  {
+++    int num_bytes = sizeof(rpi_hevc_transform);
+++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
+++    memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes);
+++  }
+++  // And the transform coefficients
+++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+++
+++  *gpu = ge;
+++  return 0;
+++}
+++
+++
+++
+++static void gpu_unlock(void) {
+++  pthread_mutex_unlock(&gpu_mutex);
+++}
+++
+++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
+++static gpu_env_t * gpu_lock(void) {
+++  pthread_mutex_lock(&gpu_mutex);
+++
+++  av_assert0(gpu != NULL);
+++  return gpu;
+++}
+++
+++static gpu_env_t * gpu_lock_ref(void)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p->arm;
+-+    iocache.s[0].size  = p->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp = vcsm_lock(p->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+++  pthread_mutex_lock(&gpu_mutex);
+++
+++  if (gpu == NULL) {
+++    int rv = gpu_init(&gpu);
+++    if (rv != 0) {
+++      gpu_unlock();
+++      return NULL;
+++    }
+++  }
+++
+++  ++gpu->open_count;
+++  return gpu;
+ +}
+ +
+-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
+++static void gpu_unlock_unref(gpu_env_t * const ge)
+ +{
+-+#ifdef RPI_FAST_CACHEFLUSH
+-+    struct vcsm_user_clean_invalid_s iocache = {};
+-+    iocache.s[0].handle = p0->vcsm_handle;
+-+    iocache.s[0].cmd = 3; // clean+invalidate
+-+    iocache.s[0].addr = (int) p0->arm;
+-+    iocache.s[0].size  = p0->numbytes;
+-+    iocache.s[1].handle = p1->vcsm_handle;
+-+    iocache.s[1].cmd = 3; // clean+invalidate
+-+    iocache.s[1].addr = (int) p1->arm;
+-+    iocache.s[1].size  = p1->numbytes;
+-+    iocache.s[2].handle = p2->vcsm_handle;
+-+    iocache.s[2].cmd = 3; // clean+invalidate
+-+    iocache.s[2].addr = (int) p2->arm;
+-+    iocache.s[2].size  = p2->numbytes;
+-+    vcsm_clean_invalid( &iocache );
+-+#else
+-+    void *tmp;
+-+    tmp = vcsm_lock(p0->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p1->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+    tmp = vcsm_lock(p2->vcsm_handle);
+-+    vcsm_unlock_ptr(tmp);
+-+#endif
+++  if (--ge->open_count == 0)
+++    gpu_term();
+++
+++  gpu_unlock();
+++}
+++
+++static inline gpu_env_t * gpu_ptr(void)
+++{
+++  av_assert0(gpu != NULL);
+++  return gpu;
+ +}
+ +
+-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
+-+  p->numbytes = numbytes;
+-+  p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
+-+  //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
+-+  av_assert0(p->vcsm_handle);
+-+  p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
+-+  av_assert0(p->vc_handle);
+-+  p->arm = vcsm_lock(p->vcsm_handle);
+-+  av_assert0(p->arm);
+-+  p->vc = mem_lock(gpu->mb, p->vc_handle);
+-+  av_assert0(p->vc);
+-+  return 0;
+++// Public gpu fns
+++
+++// Allocate memory on GPU
+++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
+++// Returns 0 on success.
+++// This allocates memory that will not be cached in ARM's data cache.
+++// Therefore safe to use without data cache flushing.
+++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+++{
+++  int r;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+++  r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
+++  gpu_unlock();
+++  return r;
+ +}
+ +
+ +// This allocates data that will be
+@@ -11910,653 +15034,518 @@ index 0000000..365f4a6
+ +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+ +{
+ +  int r;
+-+  gpu_lock();
+-+  r = gpu_malloc_cached_internal(numbytes, p);
+-+  gpu->open_count++;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+++  r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
+ +  gpu_unlock();
+ +  return r;
+ +}
+ +
+-+static void gpu_term(void)
+-+{
+-+  int mb;
+-+
+-+  if (gpu==NULL)
+-+    return;
+-+  mb = gpu->mb;
+-+
+-+  // ??? Tear down anything needed for gpuexecute
+-+
+-+  qpu_enable(mb, 0);
+-+  gpu_free_internal(&gpu_mem_ptr);
+-+
+-+  vc_gpuserv_deinit();
+-+  vcsm_exit();
+-+
+-+  mbox_close(mb);
+-+  gpu = NULL;
+++void gpu_free(GPU_MEM_PTR_T * const p) {
+++  gpu_env_t * const ge = gpu_lock();
+++  gpu_free_internal(ge->mb, p);
+++  gpu_unlock_unref(ge);
+ +}
+ +
+-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
+-+  int mb = gpu->mb;
+-+  mem_unlock(mb,p->vc_handle);
+-+  vcsm_unlock_ptr(p->arm);
+-+  vcsm_free(p->vcsm_handle);
+++unsigned int vpu_get_fn(void) {
+++  // Make sure that the gpu is initialized
+++  av_assert0(gpu != NULL);
+++  return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code);
+ +}
+ +
+-+void gpu_free(GPU_MEM_PTR_T *p) {
+-+  gpu_lock();
+-+
+-+  gpu_free_internal(p);
+-+
+-+  gpu->open_count--;
+-+  if (gpu->open_count==0) {
+-+      printf("Closing GPU\n");
+-+      gpu_term();
+-+      gpu = NULL;
+-+  }
+-+  gpu_unlock();
+++unsigned int vpu_get_constants(void) {
+++  av_assert0(gpu != NULL);
+++  return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
+ +}
+ +
+-+unsigned int vpu_get_fn(void) {
+-+  // Make sure that the gpu is initialized
+-+  if (gpu==NULL) {
+-+    printf("Preparing gpu\n");
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,vpu_code);
+++int gpu_get_mailbox(void)
+++{
+++  av_assert0(gpu);
+++  return gpu->mb;
+ +}
+ +
+-+unsigned int vpu_get_constants(void) {
+-+  if (gpu==NULL) {
+-+    gpu_lock();
+-+    gpu_unlock();
+-+  }
+-+  return gpu->vc + offsetof(struct GPU,transMatrix2even);
+++void gpu_ref(void)
+++{
+++  gpu_lock_ref();
+++  gpu_unlock();
+ +}
+ +
+-+#ifdef GPUSERVICE
+-+static void callback(void *cookie)
+++void gpu_unref(void)
+ +{
+-+  sem_post((sem_t *)cookie);
+++  gpu_env_t * const ge = gpu_lock();
+++  gpu_unlock_unref(ge);
+ +}
+-+#endif
+ +
+++// ----------------------------------------------------------------------------
+++//
+++// Cache flush functions
+ +
+-+static volatile uint32_t post_done = 0;
+-+static volatile uint32_t post_qed = 0;
+ +
+-+static void post_code2_cb(void * v)
+++rpi_cache_flush_env_t * rpi_cache_flush_init()
+ +{
+-+  uint32_t n = (uint32_t)v;
+-+  if ((int32_t)(n - post_done) > 0) {
+-+    post_done = n;
+-+  }
+++    rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t));
+++    if (rfe == NULL)
+++        return NULL;
+++
+++    rfe->n = 0;
+++    return rfe;
+ +}
+ +
+++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+++{
+++    if (rfe != NULL)
+++        free(rfe);
+++}
+ +
+-+// Post a command to the queue
+-+// Returns an id which we can use to wait for completion
+-+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
+++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
+ +{
+-+  struct gpu_job_s j[1] = {
+++    int rc = 0;
+++    unsigned int na;
+++    unsigned int nr;
+++
+++    // Clear any remaining entries in the final block
+++    if ((nr = rfe->n % CFE_ENTS_PER_A) != 0)
+++        memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0]));
+++
+++    for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na)
+ +    {
+-+      .command = EXECUTE_VPU,
+-+      .u.v.q = {code, r0, r1, r2, r3, r4, r5},
+-+      .callback.func = post_code2_cb
+++        if (vcsm_clean_invalid(rfe->a + na) != 0)
+++            rc = -1;
+ +    }
+-+  };
+-+  uint32_t id;
+ +
+-+  j[0].callback.cookie = (void *)(id = ++post_qed);
+++    free(rfe);
+ +
+-+  av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++    if (rc == 0)
+++        return 0;
+ +
+-+  return id;
+++    av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno);
+++    return rc;
+ +}
+ +
+-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+    int qpu0_n, const uint32_t * qpu0_mail,
+-+    int qpu1_n, const uint32_t * qpu1_mail)
+++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
+ +{
+-+#if 1
+-+  sem_t sync0;
+-+  struct gpu_job_s j[4];
+-+
+-+  sem_init(&sync0, 0, 0);
+-+
+-+  j[0].command = EXECUTE_VPU;
+-+  j[0].u.v.q[0] = vpu_code;
+-+  j[0].u.v.q[1] = r0;
+-+  j[0].u.v.q[2] = r1;
+-+  j[0].u.v.q[3] = r2;
+-+  j[0].u.v.q[4] = r3;
+-+  j[0].u.v.q[5] = r4;
+-+  j[0].u.v.q[6] = r5;
+-+  j[0].callback.func = 0;
+-+  j[0].callback.cookie = NULL;
+-+
+-+  j[1].command = EXECUTE_QPU;
+-+  j[1].u.q.jobs = qpu1_n;
+-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+-+  j[1].u.q.timeout = 5000;
+-+  j[1].callback.func = 0;
+-+  j[1].callback.cookie = NULL;
+-+
+-+  j[2].command = EXECUTE_QPU;
+-+  j[2].u.q.jobs = qpu0_n;
+-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[2].u.q.noflush = 1;
+-+  j[2].u.q.timeout = 5000;
+-+  j[2].callback.func = 0;
+-+  j[2].callback.cookie = NULL;
+-+
+-+  j[3].command = EXECUTE_SYNC;
+-+  j[3].u.s.mask = 3;
+-+  j[3].callback.func = callback;
+-+  j[3].callback.cookie = (void *)&sync0;
+-+
+-+  av_assert0(vc_gpuserv_execute_code(4, j) == 0);
+-+
+-+  sem_wait(&sync0);
+-+#else
+++    // Deal with empty pointer trivially
+++    if (gm == NULL || gm->numbytes == 0)
+++        return;
+ +
+-+  sem_t sync0, sync2;
+-+  struct gpu_job_s j[3];
+-+
+-+  sem_init(&sync0, 0, 0);
+-+  sem_init(&sync2, 0, 0);
+-+
+-+  j[0].command = EXECUTE_VPU;
+-+  j[0].u.v.q[0] = vpu_code;
+-+  j[0].u.v.q[1] = r0;
+-+  j[0].u.v.q[2] = r1;
+-+  j[0].u.v.q[3] = r2;
+-+  j[0].u.v.q[4] = r3;
+-+  j[0].u.v.q[5] = r4;
+-+  j[0].u.v.q[6] = r5;
+-+  j[0].callback.func = callback;
+-+  j[0].callback.cookie = (void *)&sync0;
+-+
+-+  j[1].command = EXECUTE_QPU;
+-+  j[1].u.q.jobs = qpu1_n;
+-+  memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[1].u.q.noflush = FLAGS_FOR_PROFILING;
+-+  j[1].u.q.timeout = 5000;
+-+  j[1].callback.func = 0;
+-+  j[1].callback.cookie = NULL;
+-+
+-+  j[2].command = EXECUTE_QPU;
+-+  j[2].u.q.jobs = qpu0_n;
+-+  memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+-+  j[2].u.q.noflush = 1;
+-+  j[2].u.q.timeout = 5000;
+-+  j[2].callback.func = callback;
+-+  j[2].callback.cookie = (void *)&sync2;
+-+
+-+  av_assert0(vc_gpuserv_execute_code(3, j) == 0);
+-+
+-+  sem_wait(&sync0);
+-+  sem_wait(&sync2);
+-+#endif
+++    {
+++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
+++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
+ +
+-+  return 0;
+-+}
+++        av_assert0(rfe->n < CFE_ENT_COUNT);
+ +
+++        a->s[n].cmd = mode;
+++        a->s[n].handle = gm->vcsm_handle;
+++        a->s[n].addr = (unsigned int)gm->arm;
+++        a->s[n].size = gm->numbytes;
+++        ++rfe->n;
+++    }
+++}
+ +
+-+// Wait for completion of the given command
+-+void vpu_wait(int id)
+++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
+++  const unsigned int offset, const unsigned int size)
+ +{
+-+  if (id == 0) {
+-+#if 0
+-+    sem_t sync0;
+-+    struct gpu_job_s j[1] =
+++    // Deal with empty pointer trivially
+++    if (gm == NULL || size == 0)
+++        return;
+++
+++//    printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes);
+++
+++    av_assert0(offset <= gm->numbytes);
+++    av_assert0(size <= gm->numbytes);
+++    av_assert0(offset + size <= gm->numbytes);
+++
+ +    {
+-+      {
+-+        .command = EXECUTE_SYNC,
+-+        .u.s.mask = 3,
+-+        .callback.func = callback,
+-+        .callback.cookie = (void *)&sync0
+-+      }
+-+    };
+++        struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A);
+++        const unsigned int n = rfe->n % CFE_ENTS_PER_A;
+ +
+-+    sem_init(&sync0, 0, 0);
+++        av_assert0(rfe->n < CFE_ENT_COUNT);
+ +
+-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
+++        a->s[n].cmd = mode;
+++        a->s[n].handle = gm->vcsm_handle;
+++        a->s[n].addr = (unsigned int)gm->arm + offset;
+++        a->s[n].size = size;
+++        ++rfe->n;
+++    }
+++}
+ +
+-+    sem_wait(&sync0);
+++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
+++{
+++#if !RPI_ONE_BUF
+++#error Fixme! (NIF)
+ +#endif
+++  if (gpu_is_buf1(frame)) {
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
+ +  }
+-+  else {
+-+    while ((int32_t)(post_done - (uint32_t)id) < 0) {
+-+      usleep(1000);
+-+    }
+++  else
+++  {
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
+++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
+ +  }
+ +}
+ +
+-+
+-+unsigned int qpu_get_fn(int num) {
+-+    // Make sure that the gpu is initialized
+-+    unsigned int *fn;
+-+    if (gpu==NULL) {
+-+      printf("Preparing gpu\n");
+-+      gpu_lock();
+-+      gpu_unlock();
+++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
+++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma)
+++{
+++  const unsigned int y_offset = frame->linesize[0] * start_line;
+++  const unsigned int y_size = frame->linesize[0] * n;
+++  // Round UV up/down to get everything
+++  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
+++  const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift);
+++  const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset;
+++
+++  // All operands are unsigned, so these checks also reject negative values
+++  // Test individually as well as added to reject overflow
+++  av_assert0(start_line <= (unsigned int)frame->height);
+++  av_assert0(n <= (unsigned int)frame->height);
+++  av_assert0(start_line + n <= (unsigned int)frame->height);
+++
+++  if (!gpu_is_buf1(frame))
+++  {
+++    if (do_luma) {
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
+ +    }
+-+    switch(num) {
+-+    case QPU_MC_SETUP:
+-+      fn = mc_setup;
+-+      break;
+-+    case QPU_MC_FILTER:
+-+      fn = mc_filter;
+-+      break;
+-+    case QPU_MC_EXIT:
+-+      fn = mc_exit;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT12:
+-+      fn = mc_interrupt_exit12;
+-+      break;
+-+    case QPU_MC_FILTER_B:
+-+      fn = mc_filter_b;
+-+      break;
+-+    //case QPU_MC_FILTER_HONLY:
+-+    //  fn = mc_filter_honly;
+-+    //  break;
+-+    case QPU_MC_SETUP_UV:
+-+      fn = mc_setup_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV:
+-+      fn = mc_filter_uv;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B0:
+-+      fn = mc_filter_uv_b0;
+-+      break;
+-+    case QPU_MC_FILTER_UV_B:
+-+      fn = mc_filter_uv_b;
+-+      break;
+-+    case QPU_MC_INTERRUPT_EXIT8:
+-+      fn = mc_interrupt_exit8;
+-+      break;
+-+    case QPU_MC_END:
+-+      fn = mc_end;
+-+      break;
+-+    default:
+-+      printf("Unknown function\n");
+-+      exit(-1);
+++    if (do_chroma) {
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
+++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
+++    }
+++  }
+++  else if (!rpi_sliced_frame(frame))
+++  {
+++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+++    if (do_luma) {
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
+++    }
+++    if (do_chroma) {
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
+++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
+++    }
+++  }
+++  else
+++  {
+++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
+++//    printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' ');
+++    for (int x = 0; x < frame->width; x += frame->linesize[0]) {
+++      if (do_luma) {
+++        rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size);
+++      }
+++      if (do_chroma) {
+++        rpi_cache_flush_add_gm_range(rfe, gm, mode,
+++                                     (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size);
+++      }
+ +    }
+-+    return gpu->vc + 4*(int)(fn-rpi_shader);
+-+    //return code[num] + gpu->vc;
+++  }
+ +}
+ +
+-+#if 0
+-+typedef unsigned int uint32_t;
+-+
+-+typedef struct mvs_s {
+-+    GPU_MEM_PTR_T unif_mvs_ptr;
+-+    uint32_t *unif_mvs; // Base of memory for motion vector commands
+++// Call this to clean and invalidate a region of memory
+++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+++{
+++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
+++  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
+++  rpi_cache_flush_finish(rfe);
+++}
+ +
+-+    // _base pointers are to the start of the row
+-+    uint32_t *mvs_base[8];
+-+    // these pointers are to the next free space
+-+    uint32_t *u_mvs[8];
+ +
+-+} HEVCContext;
+++// ----------------------------------------------------------------------------
+ +
+-+#define RPI_CHROMA_COMMAND_WORDS 12
+ +
+-+static void rpi_inter_clear(HEVCContext *s)
+++// Wait abstractions - mostly so we can easily add profile code
+++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+ +{
+-+    int i;
+-+    for(i=0;i<8;i++) {
+-+        s->u_mvs[i] = s->mvs_base[i];
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 0;
+-+        *s->u_mvs[i]++ = 128;  // w
+-+        *s->u_mvs[i]++ = 128;  // h
+-+        *s->u_mvs[i]++ = 128;  // stride u
+-+        *s->u_mvs[i]++ = 128;  // stride v
+-+        s->u_mvs[i] += 3;  // Padding words
+-+    }
+++  unsigned int i;
+++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+++    sem_init(&wp->pool[i].sem, 0, 0);
+++    wp->pool[i].next = wp->pool + i + 1;
+++  }
+++  wp->head = wp->pool + 0;
+++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+ +}
+ +
+-+static void rpi_execute_inter_qpu(HEVCContext *s)
+++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+ +{
+-+    int k;
+-+    uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
+++  unsigned int i;
+++  wp->head = NULL;
+++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
+++    sem_destroy(&wp->pool[i].sem);
+++    wp->pool[i].next = NULL;
+++  }
+++}
+ +
+-+    for(k=0;k<8;k++) {
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
+-+        s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); //  dummy location for V
+-+    }
+ +
+-+    s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
+-+
+-+    qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
+-+      (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
+-+      (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
+-+      );
+++// If sem_init actually takes time then maybe we want a pool...
+++static vq_wait_t * vq_wait_new(const unsigned int cost)
+++{
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  vq_wait_t * const wait = ge->wait_pool.head;
+++  ge->wait_pool.head = wait->next;
+++  ge->current_load += cost;
+++  wait->cost = cost;
+++  wait->next = NULL;
+++
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  tto_start(&ge->ttw.active, ns_time());
+++#endif
+++
+++  gpu_unlock();
+++  return wait;
+ +}
+ +
+-+void rpi_test_qpu(void)
+++static void vq_wait_delete(vq_wait_t * const wait)
+ +{
+-+    HEVCContext mvs;
+-+    HEVCContext *s = &mvs;
+-+    int i;
+-+    int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
+-+    uint32_t *p;
+-+    printf("Allocate memory\n");
+-+    gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
+-+    s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
+-+
+-+    // Set up initial locations for uniform streams
+-+    p = s->unif_mvs;
+-+    for(i = 0; i < 8; i++) {
+-+        s->mvs_base[i] = p;
+-+        p += uv_commands_per_qpu;
+++  gpu_env_t * const ge = gpu_lock();
+++  wait->next = ge->wait_pool.head;
+++  ge->wait_pool.head = wait;
+++
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  {
+++    trace_time_wait_t * const ttw = &ge->ttw;
+++    const int64_t now = ns_time();
+++    ++ttw->jcount;
+++    tto_end(&ttw->wait, now);
+++
+++    if (ttw->start0 == 0)
+++    {
+++      ttw->start0 = ttw->active.start[0];
+++      ttw->last_update = ttw->start0;
+ +    }
+-+    // Now run a simple program that should just quit immediately after a single texture fetch
+-+    rpi_inter_clear(s);
+-+    for(i=0;i<4;i++) {
+-+      printf("Launch QPUs\n");
+-+      rpi_execute_inter_qpu(s);
+-+      printf("Done\n");
+++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
+++    {
+++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
+++      ttw_print(ttw, now);
+ +    }
+-+    printf("Free memory\n");
+-+    gpu_free(&s->unif_mvs_ptr);
+-+    return;
+-+}
+++  }
+ +#endif
+++  gpu_unlock_unref(ge);
+++}
+ +
+-+#if 0
+-+
+-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
+-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
+-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
+-+
+-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
+-+
+-+static uint8_t av_clip_uint8(int32_t a)
+++static void vq_wait_wait(vq_wait_t * const wait)
+ +{
+-+    if (a&(~255)) return (-a)>>31;
+-+    else          return a;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++  {
+++      const int64_t now = ns_time();
+++      gpu_env_t * const ge = gpu_lock();
+++      tto_start(&ge->ttw.wait, now);
+++      gpu_unlock();
+++  }
+++#endif
+++
+++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
+++    /* loop */;
+ +}
+ +
+-+static int32_t filter8(const uint8_t *data, int pitch)
+++static void vq_wait_post(vq_wait_t * const wait)
+ +{
+-+   int32_t vsum = 0;
+-+   int x, y;
+++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
+++  if (wait->cost != 0)
+++#endif
+++  {
+++    gpu_env_t *const ge = gpu_lock();
+++    ge->current_load -= wait->cost;
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++    tto_end(&ge->ttw.active, ns_time());
+++#endif
+++    gpu_unlock();
+++  }
+ +
+-+   for (y = 0; y < 8; y++) {
+-+      int32_t hsum = 0;
+++  sem_post(&wait->sem);
+++}
+ +
+-+      for (x = 0; x < 8; x++)
+-+         hsum += hcoeffs[x]*data[x + y * pitch];
+ +
+-+      vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
+-+   }
+ +
+-+   return av_clip_uint8( (vsum + 64) >> 7);
+-+}
+++// Header comments were wrong for these two
+++#define VPU_QPU_MASK_QPU  1
+++#define VPU_QPU_MASK_VPU  2
+ +
+-+// Note regression changes coefficients so is not thread safe
+-+//#define REGRESSION
+-+#ifdef REGRESSION
+-+#define CMAX 100
+-+#else
+-+#define CMAX 2
+-+#endif
+-+#define YMAX 16
+++#define VPU_QPU_JOB_MAX 4
+++struct vpu_qpu_job_env_s
+++{
+++  unsigned int n;
+++  unsigned int mask;
+++  unsigned int cost;
+++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
+++};
+ +
+-+int rpi_test_shader(void)
+++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+++
+++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
+ +{
+-+   int i, c;
+++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
+++  return vqj;
+++}
+ +
+-+   uint32_t *unifs;
+++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+++{
+++  memset(vqj, 0, sizeof(*vqj));
+++  free(vqj);
+++}
+ +
+-+   uint8_t *in_buffer;
+-+   uint8_t *out_buffer[2];
+++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
+++{
+++  struct gpu_job_s * const j = vqj->j + vqj->n++;
+++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
+++  return j;
+++}
+ +
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   GPU_MEM_PTR_T in_buffer_ptr;
+-+   GPU_MEM_PTR_T out_buffer_ptr[2];
+++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
+++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
+++{
+++  if (vpu_code != 0) {
+++    struct gpu_job_s *const j = new_job(vqj);
+++    vqj->mask |= VPU_QPU_MASK_VPU;
+++
+++    j->command = EXECUTE_VPU;
+++    j->u.v.q[0] = vpu_code;
+++    j->u.v.q[1] = r0;
+++    j->u.v.q[2] = r1;
+++    j->u.v.q[3] = r2;
+++    j->u.v.q[4] = r3;
+++    j->u.v.q[5] = r4;
+++    j->u.v.q[6] = r5;
+++  }
+++}
+ +
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+++// flags are QPU_FLAGS_xxx
+++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
+++{
+++  if (n != 0) {
+++    struct gpu_job_s *const j = new_job(vqj);
+++    vqj->mask |= VPU_QPU_MASK_QPU;
+++    vqj->cost += cost;
+++
+++    j->command = EXECUTE_QPU;
+++    j->u.q.jobs = n;
+++#if RPI_TRACE_QPU_PROFILE_ALL
+++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+++#else
+++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+++#endif
+++    j->u.q.timeout = 5000;
+++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
+++  }
+++}
+ +
+-+   int pitch = 0x500;
+++// Convert callback to sem post
+++static void vpu_qpu_job_callback_wait(void * v)
+++{
+++  vq_wait_post(v);
+++}
+ +
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+++{
+++  vq_wait_t * wait;
+ +
+-+   printf("This needs to change to reflect new assembler\n");
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+++  if (vqj->mask == 0) {
+++    *wait_h = NULL;
+++    return;
+++  }
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return -2;
+-+   }
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+++  // We are going to want a sync object
+++  wait = vq_wait_new(vqj->cost);
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
+-+      return -3;
+-+   }
+-+   in_buffer = (uint8_t*)in_buffer_ptr.arm;
+++  // There are 2 VPU Qs & 1 QPU Q, so the sync can be collapsed onto the
+++  // last job if we only posted one thing or only QPU jobs
+++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
+++  {
+++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
+++    av_assert0(j->callback.func == 0);
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
+-+      return -4;
+-+   }
+-+   out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
+-+   out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
+-+
+-+   for (c = 0; c < CMAX; c++) {
+-+      int xo[] = {rand()&31, rand()&31};
+-+
+-+#ifdef REGRESSION
+-+      for (i = 0; i < 8; i++) {
+-+         hcoeffs[i] = (int8_t)rand();
+-+         vcoeffs[i] = (int8_t)rand();
+-+         if (hcoeffs[i]==-128)
+-+           hcoeffs[i]++;
+-+         if (vcoeffs[i]==-128)
+-+           vcoeffs[i]++;
+-+      }
+-+#endif
+++    j->callback.func = vpu_qpu_job_callback_wait;
+++    j->callback.cookie = wait;
+++  }
+++  else
+++  {
+++    struct gpu_job_s *const j = new_job(vqj);
+ +
+-+      for (i = 0; i < 64*23; i++) {
+-+         //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
+-+         in_buffer[i] = rand();
+-+      }
+++    j->command = EXECUTE_SYNC;
+++    j->u.s.mask = vqj->mask;
+++    j->callback.func = vpu_qpu_job_callback_wait;
+++    j->callback.cookie = wait;
+++  }
+ +
+-+      // Clear output array
+-+      {
+-+        int b;
+-+        for(b=0;b<2;b++) {
+-+          for(i=0;i<16*16;i++) {
+-+            out_buffer[b][i] = 3;
+-+          }
+-+        }
+-+      }
+++  vqj->cost = 0;
+++  vqj->mask = 0;
+++  *wait_h = wait;
+++}
+ +
+-+      unifs[0] = mc_filter;
+-+      unifs[1] = in_buffer_ptr.vc+xo[0]+16;
+-+      unifs[2] = 64; // src pitch
+-+      unifs[3] = pitch; // dst pitch
+-+      unifs[4] = 0; // Padding
+-+      unifs[5] = 0;
+-+      unifs[6] = 0;
+-+      unifs[7 ] = mc_filter;
+-+      unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
+-+      unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[13] = out_buffer_ptr[0].vc;
+-+      unifs[14] = mc_exit;
+-+      unifs[15] = in_buffer_ptr.vc+xo[1]+16;        // dummy
+-+      unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+      unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+      unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+      unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+      unifs[20] = out_buffer_ptr[1].vc;
+-+
+-+      printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+      // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
+-+
+-+      //qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+      //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
+-+      rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
+-+
+-+      if (1)
+-+      {
+-+         int x, y, b;
+-+         int bad = 0;
+-+
+-+         for (b=0; b<2; ++b)
+-+            for (y=0; y<YMAX; ++y)
+-+               for (x=0; x<16; ++x) {
+-+                  int32_t ref = filter8(in_buffer+x+y*64+xo[b], 64);
+-+
+-+                  if (out_buffer[b][x+y*pitch] != ref) {
+-+                      bad = 1;
+-+//                     printf("%d, %d, %d, %d\n", c, b, x, y);
+-+                  }
+-+#ifndef REGRESSION
+-+                  //printf("%08x %08x\n", out_buffer[b][x+y*pitch], ref);
+-+#endif
+-+               }
+-+          if (bad)
+-+            printf("Failed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+          else
+-+            printf("Passed dst=%x test=%d\n",out_buffer_ptr[1].vc,c);
+-+      }
+-+      //printf("%d\n", simpenrose_get_qpu_tick_count());
+-+   }
+++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+++{
+++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
+++}
+ +
+-+   gpu_free(&out_buffer_ptr[0]);
+-+   gpu_free(&out_buffer_ptr[1]);
+-+   gpu_free(&in_buffer_ptr);
+-+   gpu_free(&unifs_ptr);
+++// Simple wrapper of start + delete
+++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
+++{
+++  int rv;
+++  rv = vpu_qpu_job_start(vqj);
+++  vpu_qpu_job_delete(vqj);
+++  return rv;
+++}
+ +
+-+   return 0;
+++unsigned int vpu_qpu_current_load(void)
+++{
+++  return gpu_ptr()->current_load;
+ +}
+ +
+-+void rpi_do_block_arm(const uint8_t *in_buffer, int src_pitch, uint8_t *dst, int dst_pitch)
+++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+ +{
+-+  int x,y;
+-+  for (y=0; y<16; ++y) {
+-+    for (x=0; x<16; ++x) {
+-+       dst[x+y*dst_pitch] = filter8(in_buffer+x+y*src_pitch, src_pitch);
+++  if (wait_h != NULL)
+++  {
+++    vq_wait_t * const wait = *wait_h;
+++    if (wait != NULL) {
+++      *wait_h = NULL;
+++      vq_wait_wait(wait);
+++      vq_wait_delete(wait);
+ +    }
+ +  }
+ +}
+ +
+-+void rpi_do_block(const uint8_t *in_buffer_vc, int src_pitch, uint8_t *dst_vc, int dst_pitch, uint8_t *dst)
+++int vpu_qpu_init()
+ +{
+-+   uint32_t *unifs;
+-+
+-+   GPU_MEM_PTR_T unifs_ptr;
+-+   //uint8_t *out_buffer;
+-+   //GPU_MEM_PTR_T out_buffer_ptr;
+-+
+-+   // Addresses in GPU memory of filter programs
+-+   uint32_t mc_setup = 0;
+-+   uint32_t mc_filter = 0;
+-+   uint32_t mc_exit = 0;
+-+   //int x,y;
+++  gpu_env_t * const ge = gpu_lock_ref();
+++  if (ge == NULL)
+++    return -1;
+ +
+-+   if (gpu==NULL) {
+-+      gpu_lock();
+-+      gpu_unlock();
+-+   }
+++  if (ge->init_count++ == 0)
+++  {
+++    vc_gpuserv_init();
+++  }
+ +
+-+   // Use table to compute locations of program start points
+-+   mc_setup = code[0] + gpu->vc;
+-+   mc_filter = code[1] + gpu->vc;
+-+   mc_exit = code[2] + gpu->vc;
+++  gpu_unlock();
+++  return 0;
+++}
+ +
+-+   if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
+-+      return;
+-+   }
+-+   //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
+-+   //out_buffer = (uint8_t*)out_buffer_ptr.arm;
+++void vpu_qpu_term()
+++{
+++  gpu_env_t * const ge = gpu_lock();
+ +
+-+   /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         out_buffer[x+y*dst_pitch] = 7;
+-+      }
+-+    }*/
+++  if (--ge->init_count == 0) {
+++    vc_gpuserv_deinit();
+ +
+-+   unifs = (uint32_t*)unifs_ptr.arm;
+-+
+-+    unifs[0] = mc_filter;
+-+    unifs[1] = (int)in_buffer_vc;
+-+    unifs[2] = src_pitch; // src pitch
+-+    unifs[3] = dst_pitch; // dst pitch
+-+    unifs[4] = 0; // Padding
+-+    unifs[5] = 0;
+-+    unifs[6] = 0;
+-+    unifs[7 ] = mc_exit;
+-+    unifs[8 ] = (int)in_buffer_vc;
+-+    unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
+-+    unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
+-+    unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
+-+    unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
+-+    unifs[13] = (int)dst_vc;
+-+    //unifs[13] = (int)out_buffer_ptr.vc;
+-+
+-+    //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
+-+
+-+    qpu_run_shader(mc_setup, unifs_ptr.vc);
+-+
+-+    /*for (y=0; y<16; ++y) {
+-+      for (x=0; x<16; ++x) {
+-+         dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
+-+      }
+-+    }*/
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+++    ttw_print(&ge->ttw, ns_time());
+++#endif
+++  }
+ +
+-+    gpu_free(&unifs_ptr);
+-+    //gpu_free(&out_buffer_ptr);
+++  gpu_unlock_unref(ge);
+ +}
+ +
+-+
+-+
+-+#endif
+++uint32_t qpu_fn(const int * const mc_fn)
+++{
+++  return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+++}
+ +
+ +#endif // RPI
+ diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+ new file mode 100644
+-index 0000000..c6cdb2b
++index 0000000..a95f7d9
+ --- /dev/null
+ +++ b/libavcodec/rpi_qpu.h
+-@@ -0,0 +1,176 @@
++@@ -0,0 +1,200 @@
+ +#ifndef RPI_QPU_H
+ +#define RPI_QPU_H
+ +
+-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
+-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
+-+#define RPI_FAST_CACHEFLUSH
+-+
+ +#define RPI_ONE_BUF 1
+ +
+ +typedef struct gpu_mem_ptr_s {
+@@ -12570,9 +15559,7 @@ index 0000000..c6cdb2b
+ +// General GPU functions
+ +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+ +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
+-+extern void gpu_free(GPU_MEM_PTR_T *p);
+-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
+-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
+++extern void gpu_free(GPU_MEM_PTR_T * const p);
+ +
+ +#include "libavutil/frame.h"
+ +#if !RPI_ONE_BUF
+@@ -12615,29 +15602,31 @@ index 0000000..c6cdb2b
+ +    return av_buffer_get_opaque(frame->buf[0]);
+ +}
+ +
+-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
+++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+ +{
+ +    return av_buffer_pool_opaque(frame->buf[n]);
+ +}
+ +
+++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
+++{
+++    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
+++    return gm->vc + (frame->data[n] - gm->arm);
+++}
+++
+ +
+ +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
+++    return get_vc_address3(frame, 0);
+ +}
+ +
+ +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 1)->vc;
+++    return get_vc_address3(frame, 1);
+ +}
+ +
+ +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
+-+    return gpu_is_buf1(frame) ?
+-+        gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
+-+        gpu_buf3_gmem(frame, 2)->vc;
+++    return get_vc_address3(frame, 2);
+ +}
+ +
+-+
+++#if 0
+ +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ +    if (gpu_is_buf1(frame))
+ +    {
+@@ -12674,48 +15663,74 @@ index 0000000..c6cdb2b
+ +    else
+ +        return *gpu_buf3_gmem(frame, 2);
+ +}
+-+
+++#endif
+ +#endif
+ +
+++// Cache flush stuff
+++
+++struct rpi_cache_flush_env_s;
+++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+++
+++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
+++// Free env without flushing
+++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
+++// Do the accumulated flush & free the env
+++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+++
+++typedef enum
+++{
+++    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
+++    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
+++    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3
+++} rpi_cache_flush_mode_t;
+++
+++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
+++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
+++  const unsigned int offset, const unsigned int size);
+++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
+++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
+++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
+++
+++// init, add, finish for one gm ptr
+++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+++
+ +
+ +// QPU specific functions
+-+extern void rpi_test_qpu(void);
+-+
+-+enum {
+-+  QPU_MC_SETUP,
+-+  QPU_MC_FILTER,
+-+  QPU_MC_EXIT,
+-+  QPU_MC_INTERRUPT_EXIT12,
+-+  QPU_MC_FILTER_B,
+-+  QPU_MC_FILTER_HONLY,
+-+  QPU_MC_SETUP_UV,
+-+  QPU_MC_FILTER_UV,
+-+  QPU_MC_FILTER_UV_B0,
+-+  QPU_MC_FILTER_UV_B,
+-+  QPU_MC_INTERRUPT_EXIT8,
+-+  QPU_MC_END
+-+  };
+-+extern unsigned int qpu_get_fn(int num);
+-+
+-+#define QPU_N_UV   8
+-+#define QPU_N_Y    12
+-+#define QPU_N_MAX  16
+++uint32_t qpu_fn(const int * const mc_fn);
+++
+++#define QPU_N_GRP_UV 4
+++#define QPU_N_UV     8
+++#define QPU_N_GRP_Y  4  // 4 QPUs per TMU
+++#define QPU_N_Y      12
+++
+++#define QPU_MAIL_EL_VALS  2
+++
+++struct vpu_qpu_wait_s;
+++typedef struct vq_wait_s * vpu_qpu_wait_h;
+++
+++// VPU specific functions
+++
+++struct vpu_qpu_job_env_s;
+++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+++
+++vpu_qpu_job_h vpu_qpu_job_new(void);
+++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
+++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
+++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
+++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
+++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
+++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
+++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
+ +
+-+#define QPU_MAIL_EL_VALS  2
+-+#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
+-+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+-+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+ +
+-+// VPU specific functions
+ +extern unsigned int vpu_get_fn(void);
+ +extern unsigned int vpu_get_constants(void);
+-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
+-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
+-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
+-+    int qpu0_n, const uint32_t * qpu0_mail,
+-+    int qpu1_n, const uint32_t * qpu1_mail);
+ +
+-+extern void vpu_wait( int id);
+++// Waits for previous post_code to complete and will null out *wait_h after use
+++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
+++unsigned int vpu_qpu_current_load(void);
+++int vpu_qpu_init(void);
+++void vpu_qpu_term(void);
+ +
+ +// Simple test of shader code
+ +extern int rpi_test_shader(void);
+@@ -12724,14 +15739,16 @@ index 0000000..c6cdb2b
+ +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
+ +
+ +extern int gpu_get_mailbox(void);
+++void gpu_ref(void);
+++void gpu_unref(void);
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+ new file mode 100644
+-index 0000000..06fb166
++index 0000000..0898ecd
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.c
+-@@ -0,0 +1,629 @@
++@@ -0,0 +1,670 @@
+ +#include "rpi_shader.h"
+ +
+ +#ifdef _MSC_VER
+@@ -12755,607 +15772,648 @@ index 0000000..06fb166
+ +__attribute__((aligned(8)))
+ +#endif
+ +unsigned int rpi_shader[] = {
+-+// ::mc_setup_uv
+-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
+-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
+-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
+-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
+-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
+-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
+-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
+-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
+-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
+-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
+-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
+-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
+-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
+-+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
+-+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
+-+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
+-+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
+-+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
+-+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
+-+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
+-+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
+-+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
+-+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+-+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
+-+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
+-+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+++// ::mc_setup_c
+++/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1          ; mov -, unif
+++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+++/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1
+++/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+++/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
+++/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+++/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+++/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+++/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+++/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif   ; mov ra12, 0
+++/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif    ; mov ra13, 0
+++/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num      ; mov ra14, 0
+++/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0
+++/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b
+++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0         ; mov ra_y, ra0.16a
+++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1
+++/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4
+++/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
+++/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
+++/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
+++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1        ; mov r1, ra_y
+++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+++/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
+++/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0
+++/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif
+++/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif
+++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+++/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5
+++/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+++/* [0x00000160] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
+++/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+++/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+++/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+++/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a
+++/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b
+++/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0         ; mov -, unif
+++/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
+++/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1         ; mov -, unif
+++/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4
+++/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2
+++/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0        ; v8subs r1, r1, r1
+++/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch
+++/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1        ; mov r1, ra_y2
+++/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+++/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0
+++/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y
+++/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif
+++/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0         ; mov -, unif
+++/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y  ; mov -, unif
+++/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0
+ +// ::mc_filter_uv
+-+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0         ; mov r1, unif
+-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+-+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3        ; mov ra1, unif
+-+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
+-+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1     ; mov vw_setup, rb28
+-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+-+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27    ; mov ra3, unif
+-+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif         ; mov rb8,  ra3.8a
+-+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif    ; mov rb9,  ra3.8b
+-+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop                   ; mov rb10, ra3.8c
+-+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+-+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+-+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+-+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+++/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif     ; mov vw_setup, rb28
+++/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
+++/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
+++/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
+++/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
+++/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
+++/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+++/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
+++/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1       ; mov ra1, unif
+++/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3       ; mov.ifnz ra1, unif
+++/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0,   r0, r2      ; mov rb8,  ra3.8a
+++/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0,   r0, 15      ; mov rb9,  ra3.8b
+++/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27    ; mov r1, ra1.16b
+++/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13      ; mov rb10, ra3.8c
+++/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+ +// :uvloop
+-+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
+-+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
+++/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
+++/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
+++/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
+++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
+++/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
+++/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
+++/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+++/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
+++/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0        ; mul24 r0, ra14, rb10
+++/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0        ; mul24 r0, ra15, rb11
+++/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256
+++/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop                   ; mul24 r1, r1, rb14
+++/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13
+++/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
+++/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop
+++/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13
+++/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1
+++/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+ +// ::mc_filter_uv_b0
+-+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0                ; mov r1, unif
+-+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next
+-+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3  	     ; mov ra1, unif
+-+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3        ; mov ra0, unif
+-+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1            ; mov vw_setup, rb21
+-+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0,   r0, i_shift16      ; mov ra3, unif
+-+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
+-+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
+-+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
+-+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
+-+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov      rb14, unif
+-+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif    ; mov r3, 0
+++/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif           ; mov vw_setup, rb28
+++/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next
+++/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x  ; mov ra1, unif
+++/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4
+++/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1        ; mul24 r2, ra1.16b, 2
+++/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1        ; mov r1, ra1.16a
+++/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+++/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2    ; mov ra3, unif
+++/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1
+++/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3       ; mov rb8,  ra3.8a
+++/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0,   r0, r2      ; mov rb9,  ra3.8b
+++/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0,   r0, 15      ; mov rb10, ra3.8c
+++/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+++/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0             ; mov rb11, ra3.8d
+++/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif
+++/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif
+ +// :uvloop_b0
+-+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20
+-+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
+-+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+-+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
+-+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
+-+// ::mc_filter_uv_b
+-+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+-+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
+-+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0                      ; mov ra_y_next, unif
+-+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
+-+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8
+-+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3                     ; mov ra1, unif
+-+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3              ; mov ra0, unif
+-+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
+-+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
+-+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
+-+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21     ; mov ra3, unif
+-+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
+-+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
+-+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
+-+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
+-+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
+-+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov      ra1, unif  ; mov rb8,  ra3.8a
+-+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif  ; mov rb9,  ra3.8b
+-+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop                 ; mov rb10, ra3.8c
+-+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0           ; mov rb11, ra3.8d
+-+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
+-+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+++/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0
+++/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y
+++/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch
+++/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2  ; v8min r0, r0, rb_k255
+++/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3        ; mov r3, rb31
+++/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4     ; mov ra12, ra13
+++/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14        ; mul24 r1, ra14, rb9
+++/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15        ; mul24 r2, ra15, rb10
+++/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0          ; mul24 r0, ra12, rb8
+++/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
+++/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
+++/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0        ; mov ra7, rb6
+++/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31
+++/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6    ; mov rb6, ra5
+++/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
+++/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4          ; mov rb4, ra4
+++/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5          ; mov rb5, ra6
+++/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7          ; mov rb7, ra8
+++/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3
+++/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin
+++/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3        ; mov -, unif
+++/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16     ; mov ra_link, unif
+++/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000
+++/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12
+++/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
+++/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
+++/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
+++/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
+++/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30
+++/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
+++/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
+++/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin
+++/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
+++/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
+++/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
+++// :uv_b0_post12
+++/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6           ; mov r3, rb7
+++/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
+++/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2           ; mov rb4, r3
+++/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2,  ra4          ; mov r3,  rb5
+++/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
+++/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2           ; mov rb6, r3
+++// :uv_b0_post_fin
+++/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif         ; mov r0, elem_num
+++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0   ; v8subs r1, r1, r1
+++/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch  ; mov r3, unif
+++/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next
+++/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x  ; mov -, unif
+++/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4
+++/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2        ; mov ra0, unif
+++/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0        ; mov ra_y2_next, ra2.16a
+++/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1        ; mov ra3, unif
+++/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1        ; mov rb8,  ra3.8a
+++/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+++/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif         ; mov rb9,  ra3.8b
+++/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif    ; mov rb10, ra3.8c
+++/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop                   ; mov rb11, ra3.8d
+++/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3
+++/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+ +// :uvloop_b
+-+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0
+-+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20
+-+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+-+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
+-+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,       r0
+-+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+-+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
+-+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+-+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0          ; mul24 r0, vpm, ra4
+-+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+-+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop                     ; mul24 r0, r0, rb14
+-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
+-+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
+-+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+-+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
+-+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_exit
+-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+-+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_interrupt_exit8
+-+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1
+++/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
+++/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8         ; mov.ifnz r3, ra_y2
+++/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch
+++/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255
+++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
+++/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
+++/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
+++/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
+++/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
+++/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
+++/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
+++/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
+++/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4    ; mov ra12, ra13
+++/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14          ; mul24 r1, ra14, rb9
+++/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15          ; mul24 r2, ra15, rb10
+++/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0            ; mul24 r0, ra12, rb8
+++/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0        ; mov ra8.16b, ra7
+++/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2        ; mul24 r0, ra15, rb11
+++/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
+++/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6          ; mul24 r1, r1, ra_k256
+++/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14        ; mov rb6, ra5
+++/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4          ; mul24 r1, r1, ra1.16a
+++/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0        ; mov rb4, ra4
+++/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5          ; mul24 r1, r1, ra_k256
+++/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12      ; mov rb5, ra6
+++/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31  ; mov ra6, rb7
+++/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13
+++/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop                   ; mov r1, r1 << 8
+++/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b
+++/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13
+++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait        ; mov rb7, ra8
+++/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3
+++/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++// ::mc_interrupt_exit8c
+++/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov  -, vw_wait ; nop ; ldtmu0
+++/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+ +/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit
+++// ::mc_exit_c
+++/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
+++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
+++/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_interrupt_exit12
+++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov  -, vw_wait ; nop ; ldtmu1
+++/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+++/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++// ::mc_exit1
+++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0
+++/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1
+++/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+++/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+++/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop        ; nop
+ +// ::mc_setup
+-+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
+-+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
+-+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+-+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+-+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+-+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
+-+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
+-+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
+-+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
+-+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+-+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
+-+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
+-+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
+-+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
+-+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
+-+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
+-+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
+-+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
+-+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
+-+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+-+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
+-+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+-+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
+-+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
+-+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+-+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop             ; mul24 r1, r1, rb_pitch
+-+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
+-+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+-+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+-+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
+-+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
+-+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+-+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
+-+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
+-+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
+-+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
+-+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
+-+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
+-+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
+-+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
+-+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
+-+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+-+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
+-+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
+-+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+-+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
+-+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
+-+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+-+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+-+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
+-+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+-+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+-+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
+-+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+-+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
+-+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
+-+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
+-+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
+-+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+-+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
+-+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+-+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
+-+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+++/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1          ; mov ra8, unif
+++/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif
+++/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+++/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+++/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+++/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+++/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
+++/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+++/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or  rb24, r1, rb_pitch
+++/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+++/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3
+++/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
+++/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+++/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+++/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b
+++/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1
+++/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1
+++/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3
+++/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+++/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b
+++/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
+++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
+++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1
+++/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
+++/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
+++/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
+++/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0
+++/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8,  0           ; mov rb8,  0
+++/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9,  0           ; mov rb9,  0
+++/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0           ; mov rb10, 0
+++/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0           ; mov rb11, 0
+++/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+++/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+++/* [0x00000be0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
+++/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
+++/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0
+++/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1
+++/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif           ; mul24 r1, r1, rb_pitch
+++/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base
+++/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
+++/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
+++/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop                   ; mul24 r1, r1, rb_pitch
+++/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2
+ +// :per_block_setup
+-+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
+-+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif  ; mov r1, elem_num
+-+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
+-+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
+-+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+-+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
+-+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8                          ; mov ra_y_next, ra1.16b
+-+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+-+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
+-+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
+-+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+-+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
+-+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
+-+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3                     ; mov ra_y2_next, ra1.16b
+-+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3                     ; mov ra1, unif
+-+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
+-+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+-+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
+-+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
+-+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
+-+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0,   ra1.16a, 7
+-+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0,   r0, ra1.16b
+-+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+-+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+-+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
+-+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
+-+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+-+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+-+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+-+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
+-+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+-+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+-+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+-+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+-+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+-+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+-+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+-+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+-+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+-+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+-+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+-+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+-+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+-+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+-+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+-+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+-+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+-+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+-+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d    ; mov r0, unif
+-+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c    ; mov r1, rb13
+-+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1          ; mov rb4, ra3.8a
+-+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3          ; mov rb5, ra3.8b
+-+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
+-+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3            ; mov rb6, ra3.8c
+-+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                 ; mov rb7, ra3.8d
+-+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
+++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif         ; mov r3, elem_num
+++/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next
+++/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next
+++/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
+++/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3
+++/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4        ; v8subs r2, r2, r2
+++/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+++/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0
+++/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b
+++/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif
+++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3
+++/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0
+++/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b
+++/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif
+++/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width
+++/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5  ; mov r0, ra_height
+++/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7
+++/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0,   r0, 7
+++/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0,   r0, ra_width
+++/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0,   r0, i_shift16
+++/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27                 ; mov r0, unif
+++/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16          ; mov ra5, unif
+++/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400
+++/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3                     ; mov rb14, ra5.16a
+++/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+++/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+++/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+++/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d
+++/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c
+++/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+++/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+++/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+++/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+++/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+++/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+++/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+++/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
+++/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
+++/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+++/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
+++/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
+++/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+++/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
+++/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
+++/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+++/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d
+++/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c
+++/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a            ; mov ra18, unif
+++/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b
+++/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c
+++/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18
+++/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif
+++/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13
+++/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9
+++/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0                  ; mov rb7, ra3.8d
+ +// ::mc_filter
+-+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
+++/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1
+ +// :yloop
+-+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+-+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+-+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+-+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+-+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+-+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+-+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+-+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+-+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+-+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+-+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+-+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+-+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+-+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+-+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+-+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+-+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+-+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+-+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+++/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
+++/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
+++/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+++/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0          ; mov -, vw_wait
+++/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop                     ; mul24 r1, r1, rb14
+++/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
+++/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
+++/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
+++/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0
+++/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
+++/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup
+++/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
+++/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
+++/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
+++/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
+++/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
+++/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
+++/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
+++/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop
+++/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop
+ +// ::mc_filter_b
+-+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+ +// :yloopb
+-+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+-+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
+-+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20
+-+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+-+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
+-+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+-+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+-+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+-+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+-+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+-+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+-+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+-+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+-+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+-+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+-+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+-+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+-+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+-+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+-+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+-+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+-+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+-+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+-+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+-+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+-+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+-+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+-+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+-+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+-+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+-+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+-+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+-+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
+-+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+-+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+-+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+-+// ::mc_interrupt_exit12
+-+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
+-+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop        ; nop
+-+// ::mc_exit1
+-+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
+-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
+-+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
+-+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop        ; nop ; thrend
+-+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
+-+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop        ; nop
+++/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+++/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3
+++/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+++/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+++/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2   ; v8min r0, r0, rb_k255
+++/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+++/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+++/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++/* [0x00001188] */ 0x40027030, 0x180049e3, // nop                  ; mul24      r3, ra0.8a,      r0
+++/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+++/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3       ; mov r3, rb31
+++/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8       ; mov r1,   ra8
+++/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8,  ra9           ; mov rb8,  rb9
+++/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9,  ra10          ; mov rb9,  rb10
+++/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11          ; mov rb10, rb11
+++/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0            ; mov rb11, r1
+++/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop                     ; mul24 r0, rb8,  ra2.8a
+++/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop                     ; mul24 r1, rb9,  ra2.8b
+++/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+++/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0          ; mul24 r0, rb11, ra2.8d
+++/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra8,  rb4
+++/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra9,  rb5
+++/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0          ; mul24 r0, ra10, rb6
+++/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0          ; mul24 r0, ra11, rb7
+++/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0          ; mov r2, rb12
+++/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+++/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop                     ; mul24 r0, r1, rb14
+++/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
+++/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0          ; mov -, vw_wait
+++/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
+++/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
+++/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
+++/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
+++/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
+++/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16
+++/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1
+++/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0
+++/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0
+++/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup
+++/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
+++/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
+++/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest
+++/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1
+++/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0
+++/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1
+++/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23
+++/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0
+++/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch
+++/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0
+++/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
+++/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb
+++/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop
+ +// ::mc_end
+ +};
+ +#ifdef __HIGHC__
+@@ -13363,7 +16421,7 @@ index 0000000..06fb166
+ +#endif
+ diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+ new file mode 100644
+-index 0000000..9772796
++index 0000000..d17b9fd
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.h
+ @@ -0,0 +1,19 @@
+@@ -13372,26 +16430,33 @@ index 0000000..9772796
+ +
+ +extern unsigned int rpi_shader[];
+ +
+-+#define mc_setup_uv (rpi_shader + 0)
+-+#define mc_filter_uv (rpi_shader + 132)
+-+#define mc_filter_uv_b0 (rpi_shader + 274)
+-+#define mc_filter_uv_b (rpi_shader + 392)
+-+#define mc_exit (rpi_shader + 540)
+-+#define mc_interrupt_exit8 (rpi_shader + 558)
+-+#define mc_setup (rpi_shader + 588)
+-+#define mc_filter (rpi_shader + 872)
+-+#define mc_filter_b (rpi_shader + 992)
+-+#define mc_interrupt_exit12 (rpi_shader + 1114)
+-+#define mc_exit1 (rpi_shader + 1152)
+-+#define mc_end (rpi_shader + 1168)
+++#define mc_setup_c (rpi_shader + 0)
+++#define mc_filter_uv (rpi_shader + 152)
+++#define mc_filter_uv_b0 (rpi_shader + 280)
+++#define mc_interrupt_exit8c (rpi_shader + 554)
+++#define mc_exit (rpi_shader + 582)
+++#define mc_exit_c (rpi_shader + 582)
+++#define mc_interrupt_exit12 (rpi_shader + 598)
+++#define mc_exit1 (rpi_shader + 634)
+++#define mc_setup (rpi_shader + 650)
+++#define mc_filter (rpi_shader + 942)
+++#define mc_filter_b (rpi_shader + 1094)
+++#define mc_end (rpi_shader + 1246)
+ +
+ +#endif
+ diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+ new file mode 100644
+-index 0000000..aa9e1e7
++index 0000000..aa3fe47
+ --- /dev/null
+ +++ b/libavcodec/rpi_shader.qasm
+-@@ -0,0 +1,1098 @@
++@@ -0,0 +1,1259 @@
+++
+++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+++# the warning that we are using rotation & ra/rb registers. r0..3 can be
+++# rotated through all 16 elems; ra regs can only be rotated through their
+++# local 4.  As it happens this is what is wanted here as we do not want the
+++# constants from the other half of the calc.
+++
+ +# register allocation
+ +#
+ +# ra0...ra7                                     eight horizontal filter coefficients
+@@ -13408,32 +16473,32 @@ index 0000000..aa9e1e7
+ +#
+ +# rb8...rb11                                    eight vertical filter coefficients
+ +
+-+# ra4                                           y: Fiter, UV: 0x10000
+++# ra4                                           y: Filter, UV: part of b0 -> b stash
+ +
+ +# rb12                                          offset to add before shift (round + weighting offsets)
+ +# rb13                                          shift: denom + 6 + 9
+ +# rb14                                          L0 weight (U on left, V on right)
+ +# rb15                                          -- free --
+ +#
+-+# ra16                                          clipped(row start address+elem_num)&~3
+-+# ra17                                          per-channel shifts
+++# ra16                                          width:height
+++# ra17                                          ra_y:ra_xshift
+ +# ra18                                          L1 weight (Y)
+-+# ra19                                          next ra17
+++# ra19                                          ra_y_next:ra_xshift_next
+ +#
+ +# rb16                                          pitch
+ +# rb17                                          height + 1
+-+# rb18                                          height + 3
+-+# rb19                                          next ra16
+++# rb18                                          max(height,16) + 3
+++# rb19                                          frame_base2_next
+ +#
+ +# ra20                                          1
+-+# ra21                                          ra_21
+++# ra21                                          ra_y2_next:ra_y2 (luma); free (chroma)
+ +# ra22 ra_k256                                  256
+-+# ra23 ra_y2_next                               ra_y2_next
+++# ra23                                          0
+ +#
+-+# rb20                                          0xffffff00
+-+# rb21                                          vpm_setup for reading/writing 16bit results into VPM
+++# rb20 rb_xpitch                                stride2
+++# rb21                                          -- free --
+ +# rb22 rb_k255                                  255
+-+# rb23                                          24
+++# rb23                                          dest (Y)
+ +#
+ +# rb24                                          vdw_setup_1(dst_pitch)
+ +# rb25                                          frame width-1
+@@ -13444,146 +16509,233 @@ index 0000000..aa9e1e7
+ +# rb30                                          frame height-1
+ +# rb31                                          used as temp to count loop iterations
+ +#
+-+# ra24                                          clipped(row start address+8+elem_num)&~3
+-+# ra25                                          per-channel shifts 2
+++# ra24                                          src frame base
+++# ra25                                          src frame base 2
+ +# ra26                                          next ra24
+ +# ra27                                          next ra25
+-+# ra28                                          next y
+-+# ra29                                          y for next texture access
+-+# ra30                                          64
+++# ra28                                          -- free --
+++# ra29                                          -- free --
+ +#
+-+# ra31                                          next kernel address
+++# Use an even numbered register as a link register to avoid corrupting flags
+++# ra30                                          next kernel address
+++# ra31                                          chroma-B height+3; free otherwise
+ +
+-+.set rb_frame_width_minus_1,       rb25
+-+.set rb_frame_height_minus_1,      rb30
+++.set rb_max_x,                     rb25
+++.set rb_max_y,                     rb30
+ +.set rb_pitch,                     rb16
+-+.set ra_x,                         ra16
+++.set ra_width_height,              ra16
+++.set ra_width,                     ra16.16b
+++.set ra_height,                    ra16.16a
+ +.set ra_y2,                        ra21.16a
+ +.set ra_y2_next,                   ra21.16b
+ +
+-+.set rb_x_next,                    rb19
+-+.set rx_frame_base2_next,          rb19
+++.set rb_base2_next,                rb19
+ +
+-+.set ra_frame_base,                ra24
+-+.set ra_frame_base_next,           ra26
+-+.set ra_xshift,                    ra17
+++.set rb_dest,                      rb23
+++.set ra_base,                      ra24
+++.set ra_base_next,                 ra26
+++.set ra_xshift,                    ra17.16a
+ +
+-+.set ra_u2v_ref_offset,            ra25
+-+.set ra_frame_base2,               ra25
+++.set ra_base2,                     ra25
+ +
+-+.set ra_xshift_next,               ra19
+-+.set rx_xshift2,                   rb0
+-+.set rx_xshift2_next,              rb1
+++# Note ra_xy & ra_xy_next should have same structure!
+++.set ra_xshift_next,               ra19.16a
+++.set rb_xshift2,                   rb0
+++.set rb_xshift2_next,              rb1
+ +
+-+.set ra_u2v_dst_offset,            ra27
+-+
+-+.set ra_y_next,                    ra28
+-+.set ra_y,                         ra29
+++.set ra_y_next,                    ra19.16b
+++.set ra_y,                         ra17.16b
+ +
+ +.set ra_k1,                        ra20
+++.set rb_xpitch,                    rb20
+ +.set rb_k255,                      rb22
+ +.set ra_k256,                      ra22
+++.set ra_k0,                        ra23
+++
+++.set ra_link,                      ra30
+ +
+ +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+ +.set i_shift16,                    -16
+ +.set i_shift21,                    -11
+++.set i_shift23,                     -9
+++.set i_shift30,                     -2
+ +
+-+################################################################################
+-+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+-+::mc_setup_uv
+++# Much of the setup code is common between Y & C
+++# Macros that express this - obviously these can't be overlapped
+++# so are probably unsuitable for loop code
+++
+++.macro m_calc_dma_regs, r_vpm, r_dma
+++  mov r2, qpu_num      # Build per-QPU VPM (r_vpm) and DMA-out (r_dma) setup words from qpu_num (8-bit h8p variant)
+++  asr r1, r2, 2        # r1 = qpu_num >> 2
+++  shl r1, r1, 6        # r1 = (qpu_num >> 2) << 6
+++  and r0, r2, 3        # r0 = qpu_num & 3
+++  or  r0, r0, r1       # r0 = ((qpu_num >> 2) << 6) | (qpu_num & 3)
+ +
+-+# Read starting kernel
+-+mov ra31, unif
+++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+++  add r_vpm, r0, r1  # VPM 8bit storage
+++
+++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+++  shl r0, r0, 5        # move the per-QPU offset up 5 bits before merging into the DMA setup word
+++  add r_dma, r0, r1  # DMA out
+++.endm
+++
+++# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16; outputs r_vpm / r_dma as m_calc_dma_regs does
+++.macro m_calc_dma_regs_c, r_vpm, r_dma
+++  mov r2, qpu_num
+++  asr r1, r2, 1        # r1 = qpu_num >> 1
+++  shl r1, r1, 5        # r1 = (qpu_num >> 1) << 5, i.e. Y = (qpu_num >> 1) * 16 placed above the H bit
+++  and r0, r2, 1        # r0 = H = qpu_num & 1
+++  or  r0, r0, r1       # r0 = ((qpu_num >> 1) << 5) | (qpu_num & 1)
+++
+++  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+++  add r_vpm, r0, r1  # VPM 16bit (h16p) storage
+++
+++  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
+++  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
+++  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
+++  shl r0, r0, 6        # 3 + 3 bits, per the note above
+++  add r_dma, r0, r1  # DMA out
+++.endm
+++
+++
+++################################################################################
+++# mc_setup_c(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+++::mc_setup_c
+++  mov tmurs, 1          ; mov -, unif        # No swap TMUs ; Next fn (ignored)
+ +
+ +# Load first request location
+-+add ra_x, unif, elem_num # Store x
+-+mov ra_y, unif # Store y
+-+mov ra_frame_base, unif # Store frame u base
+-+nop
+-+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+++  mov ra0, unif         # next_x_y
+++
+++  mov ra_base, unif                             # Store frame c base
+ +
+ +# Read image dimensions
+-+sub rb25,unif,1
+-+sub rb30,unif,1
+++  sub rb_max_x, unif, 1     # pic c width
+++  sub rb_max_y, unif, 1     # pic c height
+++
+++# load constants
+++  mov ra_k1, 1
+++  mov ra_k256, 256
+++  mov rb_k255, 255
+++  mov ra_k0, 0
+++
+++# touch registers to keep simulator happy
+++
+++  # ra/b4..7: B0 -> B stash registers
+++  mov ra4, 0 ; mov rb4, 0
+++  mov ra5, 0 ; mov rb5, 0
+++  mov ra6, 0 ; mov rb6, 0
+++  mov ra7, 0 ; mov rb7, 0
+++
+++  mov r1, vdw_setup_1(0)  # Merged with dst_stride shortly, delay slot for ra_base
+ +
+++# ; ra12..15: vertical scroll registers
+ +# get source pitch
+-+mov rb16, unif
+++  mov rb_xpitch, unif   ; mov ra12, 0           # stride2
+++  mov rb_pitch, unif    ; mov ra13, 0           # stride1
+++  mov r0, elem_num      ; mov ra14, 0
+++# get destination vdw setup
+++  add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1
+ +
+-+# get destination pitch
+-+mov r0, unif
+-+mov r1, vdw_setup_1(0)
+-+add rb24, r1, r0
+++# Compute base address for first and second access
+++# ra_base ends up with t0s base
+++# ra_base2 ends up with t1s base
+ +
+-+# load constants
+++  add r0, r0, ra0.16b                           # Add elem no to x to get X for this slice
+++  max r0, r0, 0         ; mov ra_y, ra0.16a     # ; stash Y
+++  min r0, r0, rb_max_x
+ +
+-+mov ra4, 0x10000
+-+mov ra_k1, 1
+-+mov ra_k256, 256
+-+mov ra30, 64
+++# Get shift
+++  and r1, r0, 1
+++  shl ra_xshift_next, r1, 4
+ +
+-+mov rb20, 0xffffff00
+-+mov rb_k255, 255
+-+mov rb23, 24
+++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+ +
+-+# touch vertical context to keep simulator happy
+++  and r0, r0, -2
+++  add r0, r0, r0        ; v8subs r1, r1, r1
+++  sub r1, r1, rb_pitch
+++  and r1, r0, r1
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra_y
+++  add ra_base, ra_base, r0
+ +
+-+mov ra8, 0
+-+mov ra9, 0
+-+mov ra10, 0
+-+mov ra11, 0
+-+mov ra12, 0
+-+mov ra13, 0
+-+mov ra14, 0
+-+mov ra15, 0
+++  max r0, r1, 0
+++  min r0, r0, rb_max_y
+ +
+-+# Compute base address for first and second access
+-+mov r0, ra_x           # Load x
+-+max r0, r0, 0; mov r1, ra_y # Load y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base  # Load the frame base
+-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+-+add ra_y, r1, 1
+-+add r0, r0, r3
+-+and r0, r0, ~3
+-+max r1, r1, 0 ; mov ra_x, r0 # y
+-+min r1, r1, rb_frame_height_minus_1
+ +# submit texture requests for first line
+-+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+-+add t0s, r0, r1 ; mov ra_frame_base, r2
+-+add t1s, r2, r1
+++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++  add t0s, ra_base, r0
+++
+++# submit texture requests for 2nd line
+++
+++  max r0, r1, 0
+++  min r0, r0, rb_max_y
+++
+++  add ra_y, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++  add t0s, ra_base, r0
+ +
+-+mov r2, 9
+-+add rb13, r2, unif  # denominator
+-+mov -, unif         # Unused
+++  add rb13, 9, unif     # denominator
+++  mov -, unif           # Unused
+ +
+ +# Compute part of VPM to use for DMA output
+-+mov r2, unif
+-+shl r2, r2, 1   # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
+-+and r2, r2, 15
+-+mov r1, r2
+-+asr r1, r1, 2
+-+shl r1, r1, 6
+-+mov r0, r2
+-+and r0, r0, 3
+-+add r0, r0, r1
+-+
+-+mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+add rb28, r0, r1  # VPM 8bit storage
+-+asr r2, r0, 1     # r0 = bc0000d
+-+mov r1, vpm_setup(0, 2, h16p(0, 0))  # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
+-+add rb21, r2, r1  # VPM for 16bit intermediates
+-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+shl r0, r0, 5
+-+add rb27, r0, r1  # DMA out
+++  m_calc_dma_regs_c rb28, rb27
+++
+++# -----------------
+++# And again for L1, but only worrying about frame2 stuff
+++
+++  mov ra_link, unif        # Next fn
+++
+++# Load first request location
+++  mov ra0, unif            # next_x_y
+++
+++  mov ra_base2, unif # Store frame c base
+++
+++# Compute base address for first and second access
+++# ra_base ends up with t0s base
+++# ra_base2 ends up with t1s base
+++
+++  mov ra_y2, ra0.16a       # Store y
+++  mov r0, ra0.16b          # Load x
+++  add r0, r0, elem_num     # Add QPU slice
+++  max r0, r0, 0         ; mov -, unif           # Unused 0
+++  min r0, r0, rb_max_x  ; mov -, unif           # Unused 1
+++
+++# Get shift
+++  and r1, r0, 1         ; mov -, unif           # Unused 2
+++  shl rb_xshift2_next, r1, 4
+++
+++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
+++
+++  and r0, r0, -2
+++  add r0, r0, r0        ; v8subs r1, r1, r1
+++  sub r1, r1, rb_pitch
+++  and r1, r0, r1
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra_y2
+++  add ra_base2, ra_base2, r0
+++
+++  max r0, r1, 0
+++  min r0, r0, rb_max_y
+++
+++# submit texture requests for first line
+++  add r1, r1, ra_k1     ; mul24 r0, r0, rb_pitch
+++  add t1s, ra_base2, r0 ; mov -, unif           # Unused 3
+++
+++# submit texture requests for 2nd line
+++
+++  max r0, r1, 0         ; mov -, unif           # Unused 4
+++
+++  bra -, ra_link
+++
+++  min r0, r0, rb_max_y  ; mov -, unif           # Unused 5
+++  add ra_y2, r1, ra_k1   ; mul24 r0, r0, rb_pitch
+++  add t1s, ra_base2, r0
+++
+++# >>> ra_link
+ +
+-+# submit texture requests for second line
+-+max r1, ra_y, 0
+-+min r1, r1, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1
+-+bra -, ra31
+-+nop ; mul24 r1, r1, rb_pitch
+-+add t0s, r1, ra_x
+-+add t1s, r1, ra_frame_base
+ +
+++.macro setf_nz_if_v
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]  # Z set on elems 0-7, clear (nz) on elems 8-15; used by later .ifnz ops (the V half, going by the macro name)
+++.endm
+ +
+ +
+ +################################################################################
+@@ -13593,51 +16745,51 @@ index 0000000..aa9e1e7
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# ra_x, ra_x16_base point to the current coordinates for this block
+ +::mc_filter_uv
+-+mov ra31, unif
+++  mov ra_link, unif     ; mov vw_setup, rb28    # ; x_y
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0         ; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+# compute offset from frame base u to frame base v
+-+sub r2, unif, r3      ; mov ra_xshift, ra_xshift_next
+-+shl ra_xshift_next, r0, 3
+-+add r0, r0, r3        ; mov ra1, unif  # ; width_height
+-+and rb_x_next, r0, ~3 ; mov ra0, unif  # H filter coeffs
+-+mov ra_y_next, r1     ; mov vw_setup, rb28
+-+add ra_frame_base_next, rb_x_next, r2
+++  mov ra2, unif         ; mov r0, elem_num
+++
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+++
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
+++
+++  shl ra_xshift_next, r0, 4
+++
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=width*2 (we are working in pel pairs)
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
+++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+ +
+ +# set up VPM write
+-+# get width,height of block
+ +
+-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+-+add r0,   r0, ra1.16b    # Combine width and height of destination area
+-+shl r0,   r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27    ; mov ra3, unif  # ; V filter coeffs
+++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
+++  add rb17, r1, 1       ; mov ra1, unif         # ; U offset/weight
+++  add rb18, r1, 3       ; mov.ifnz ra1, unif    # ; V offset/weight
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++# ; unpack filter coefficients
+ +
+-+# unpack filter coefficients
+++  add r0,   r0, r2      ; mov rb8,  ra3.8a      # Combine width and height of destination area
+++  shl r0,   r0, 15      ; mov rb9,  ra3.8b      # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27    ; mov r1, ra1.16b       # ; r1=weight
+ +
+-+mov ra1, unif         ; mov rb8,  ra3.8a   # U offset/weight
+-+mov.ifnz ra1, unif    ; mov rb9,  ra3.8b   # V offset/weight
+-+nop                   ; mov rb10, ra3.8c
+-+mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+++  shl r1, r1, rb13      ; mov rb10, ra3.8c
+++  mov r3, 0             ; mov rb11, ra3.8d   # Loop count
+ +
+-+shl r1, ra1.16b, rb13
+-+asr rb12, r1, 1
+-+shl rb14, ra1.16a, 1  # b14 = weight*2
+++  asr rb12, r1, 1
+++  shl rb14, ra1.16a, 1  # b14 = weight*2
+ +
+ +# rb14 - weight L0 * 2
+ +# rb13 = weight denom + 6 + 9
+ +# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+ +
+-+# r2 is elem_num
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+@@ -13646,123 +16798,114 @@ index 0000000..aa9e1e7
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++  min r2, r2, rb_max_y                              # r2 = y (r3) clamped to 0..rb_max_y
+++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch    # ; r2 = clamped y * pitch
+++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  setf_nz_if_v
+ +
+ +# apply horizontal filter
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+++# The filter coeffs for the two halves of this are the same (unlike in the
+++# Y case) so it doesn't matter which ra0 we get them from
+++
+++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++  sub r0, r2, r3        ; mov r3, rb31
+++  sub.setf -, r3, 4     ; mov ra12, ra13
+++  brr.anyn -, r:uvloop
+++  mov ra13, ra14        ; mul24 r1, ra14, rb9
+++  mov ra14, ra15
+++  mov ra15, r0          ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop
+ +
+ +# apply vertical filter and write to VPM
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+sub r1, r1, r0          ; mov -, vw_wait
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+asr r1, r1, 14
+-+nop                     ; mul24 r1, r1, rb14
+-+shl r1, r1, 8
+-+
+-+add r1, r1, rb12
+-+brr.anyn -, r:uvloop
+-+asr r1, r1, rb13
+-+min r1, r1, rb_k255       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+-+
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+-+
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++  sub r1, r1, r0        ; mul24 r0, ra14, rb10
+++  add r1, r1, r0        ; mul24 r0, ra15, rb11
+++  sub r1, r1, r0
+++  sub.setf -, r3, rb18  ; mul24 r1, r1, ra_k256
+++  asr r1, r1, 14
+++  nop                   ; mul24 r1, r1, rb14
+++  shl r1, r1, 8
+ +
+++  add r1, r1, rb12
+++  asr ra1.8as, r1, rb13
+++  nop                   ; mov r1, r1 << 8
+++  brr.anyn -, r:uvloop
+++  asr ra1.8bs, r1, rb13
+++  mov -, vw_wait
+++  mov vpm, ra1
+++
+++# >>>
+++
+++# DMA out for U & stash for V
+++  bra -, ra_link
+++  mov vw_setup, rb26
+++  mov vw_setup, rb29
+++  mov vw_addr, unif     # u_dst_addr
+++# >>>
+ +
+ +################################################################################
+ +
+-+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst)
+ +
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# ra_x, ra_x16_base point to the current coordinates for this block
+ +::mc_filter_uv_b0
+-+mov ra31, unif
+++  mov -, unif           ; mov vw_setup, rb28    # next_fn ignored - always uv_b
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num       # x
+-+max r0, r0, 0                ; mov r1, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
+-+sub r2, unif, r3             ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
+-+shl ra_xshift_next, r0, 3
+-+add r0, r0, r3  	     ; mov ra1, unif   # ; width_height
+-+and rb_x_next, r0, ~3        ; mov ra0, unif   # ; H filter coeffs
+-+mov ra_y_next, r1            ; mov vw_setup, rb21
+-+
+-+add ra_frame_base_next, rb_x_next, r2
+-+
+-+# Need to have unsigned coeffs to so we can just unpack in the filter
+-+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+-+# filter code. Unpack into b regs for V
+-+
+-+# set up VPM write, we need to save 16bit precision
+-+
+-+sub rb29, rb24, ra1.16b         # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+-+add r0,   r0, ra1.16b           # Combine width and height of destination area
+-+shl r0,   r0, i_shift16      ; mov ra3, unif  # ; V filter coeffs
+-+add rb26, r0, rb27
+-+
+-+mov rb8, ra3.8a
+-+mov rb9, ra3.8b
+-+mov rb10, ra3.8c
+-+mov rb11, ra3.8d
+-+
+-+# r2 is elem_num
+-+# r3 is loop counter
+++  mov ra2, unif         ; mov r0, elem_num
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+++
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, 0         ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov ra1, unif         # ; width_height
+++
+++  shl ra_xshift_next, r0, 4
+++
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y_next, ra2.16a
+++  and r1, r0, r1        ; mul24 r2, ra1.16b, 2  # r2=width*2 (we are working in pel pairs)
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov r1, ra1.16a       # Add stripe offsets ; r1=height
+++  add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256
+++
+++# set up VPM write
+++
+++  sub rb29, rb24, r2    ; mov ra3, unif         # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs
+++  add rb17, r1, 1
+++  add ra31, r1, 3       ; mov rb8,  ra3.8a      # ra31 = height + 3, compared with r3 at the end of uvloop_b0 ; start unpacking filter coeffs
+++
+++# ; unpack filter coefficients
+++
+++  add r0,   r0, r2      ; mov rb9,  ra3.8b      # Combine width and height of destination area
+++  shl r0,   r0, 15      ; mov rb10, ra3.8c      # Shift into bits 16 upwards of the vdw_setup0 register
+++  add rb26, r0, rb27
+++
+++  mov r3, 0             ; mov rb11, ra3.8d      # Loop count
+++
+++  mov rb14, unif                                # U weight
+++  mov.ifnz rb14, unif                           # V weight
+ +
+-+mov      rb14, unif                 # U weight L0
+-+mov.ifnz rb14, unif    ; mov r3, 0  # V weight L0 ; Loop counter
+ +# rb14 unused in b0 but will hang around till the second pass
+ +
+ +# retrieve texture results and pick out bytes
+@@ -13773,108 +16916,143 @@ index 0000000..aa9e1e7
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift    ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu0     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2    ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, 0         ; mov.ifz ra_base, ra_base_next
+++  min r2, r2, rb_max_y                              # r2 = y (r3) clamped to 0..rb_max_y
+++  add ra_y, r3, ra_k1   ; mul24 r2, r2, rb_pitch    # ; r2 = clamped y * pitch
+++  add t0s, ra_base, r2  ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop_b0
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9  # ra14 is about to be ra13
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+-+# >>> .anyn uvloop_b0
+-+
+-+# apply vertical filter and write to VPM
+++  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+sub.setf -, r3, rb18
+-+brr.anyn -, r:uvloop_b0
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+sub r1, r1, r0          ; mov -, vw_wait
+-+asr vpm, r1, 6
+++  and r1, r1, rb_k255   ; mul24      r3, ra0.8a,       r0
+++  nop                   ; mul24      r2, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8  @ "mul_used", 0  # Need to wait 1 cycle for rotated r1
+++  nop                   ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
+++  sub r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++  add r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
+++  nop                   ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++  sub r0, r2, r3        ; mov r3, rb31
+++  sub.setf -, r3, 4     ; mov ra12, ra13
+++  brr.anyn -, r:uvloop_b0
+++  mov ra13, ra14        ; mul24 r1, ra14, rb9   # ra14 is about to be ra13
+++  mov ra14, ra15        ; mul24 r2, ra15, rb10  # ra15 is about to be ra14
+++  mov ra15, r0          ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop_b0
+ +
+-+# in pass0 we don't really need to save any results, but need to discard the uniforms
+-+# DMA out for U
+-+
+-+bra -, ra31
+-+mov -, unif           # Delay 1
+-+mov -, unif           # Delay 2
+-+nop                   # Delay 3
+-+
+-+
+-+################################################################################
+-+
+-+::mc_filter_uv_b
+-+mov ra31, unif
+++# apply vertical filter and write to B-FIFO
+++
+++  sub r1, r1, r0        ; mov ra8.16b, ra7      # start of B FIFO writes
+++  add r1, r1, r2        ; mul24 r0, ra15, rb11  # N.B. ra15 write gap
+++  sub r1, r1, r0        ; mov ra7, rb6
+++
+++# FIFO goes:
+++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b
+++# This arrangement optimizes the inner loop FIFOs at the expense of making the
+++# bulk shift between loops quite a bit nastier
+++# a8 used as temp
+++
+++  sub.setf -, r3, ra31
+++  asr ra8.16a, r1, 6    ; mov rb6, ra5          # This discards the high bits that might be bad
+++  brr.anyn -, r:uvloop_b0
+++  mov ra5, rb4          ; mov rb4, ra4
+++  mov ra4, rb5          ; mov rb5, ra6
+++  mov ra6, rb7          ; mov rb7, ra8
+++# >>>
+++
+++# 1st half done all results now in the a/b4..7 fifo
+++
+++# Need to bulk rotate FIFO for heights other than 16
+++# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with
+++# we are allowed 3/4 cb_size w/h :-(
+++
+++# Destination uniforms discarded
+++# At the end drop through to _b - we will always do b after b0
+++
+++  sub.setf -, 15, r3    # 12 + 3 of preroll
+++  brr.anyn -, r:uv_b0_post_fin                  # h > 12 (n) => 16 (do nothing)
+++  sub r3, 11, r3        ; mov -, unif           # r3 = shifts wanted ; Discard u_dst_addr
+++  mov r0, i_shift16     ; mov ra_link, unif
+++  mov r1, 0x10000
+++# >>>
+++  brr.anyz -, r:uv_b0_post12                    # h == 12 deal with specially
+++# If h != 16 && h != 12 then h <= 8 so
+++# shift 8 with discard (.16b = .16a on all regs)
+++  shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
+++  shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
+++  shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
+++# >>>
+++  shl ra4, ra4, r0      ; mul24 rb4, rb4, r1
+++
+++  shl.setf -, r3, i_shift30  # b2 -> C, b1 -> N
+++# Shift 4
+++  mov.ifc ra7, ra4      ; mov.ifc rb6, rb5
+++  mov.ifc ra5, ra6      ; mov.ifc rb4, rb7
+++  # If we shifted by 4 here then the max length remaining is 4
+++  # so that is it
+++
+++  brr -, r:uv_b0_post_fin
+++# Shift 2
+++  mov.ifn ra7, ra5      ; mov.ifn rb6, rb4
+++  mov.ifn ra5, ra4      ; mov.ifn rb4, rb5
+++  mov.ifn ra4, ra6      ; mov.ifn rb5, rb7
+++  # 6 / 2 so need 6 outputs
+++# >>>
+++
+++:uv_b0_post12
+++# this one is annoying as we need to swap halves of things that don't
+++# really want to be swapped
+++
+++# b7a, a6a, b5a, a4a
+++# b4a, a5a, b6a, a7a
+++# b7b, a6b, b5b, a4b
+++# b4b, a5b, b6b, a7b
+++
+++  mov r2, ra6           ; mov r3, rb7
+++  shl ra6, ra5, r0      ; mul24 rb7, rb4, r1
+++  mov ra5, r2           ; mov rb4, r3
+++
+++  mov r2,  ra4          ; mov r3,  rb5
+++  shl ra4, ra7, r0      ; mul24 rb5, rb6, r1
+++  mov ra7, r2           ; mov rb6, r3
+++
+++:uv_b0_post_fin
+++
+++##### L1 B processing
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +
+-+# set up VPM write
+-+mov ra_xshift, ra_xshift_next      ; mov vw_setup, rb28
+-+
+ +# get base addresses and per-channel shifts for *next* invocation
+-+add r0, unif, elem_num    # x
+-+max r0, r0, 0                      ; mov ra_y_next, unif # y
+-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif        # V frame_base
+-+# compute offset from frame base u to frame base v
+-+sub r2, unif, r3                   ; mul24 ra_xshift_next, r0, 8 # U frame_base
+-+add r0, r0, r3                     ; mov ra1, unif       # width_height
+-+and rb_x_next, r0, ~3              ; mov ra0, unif       # H filter coeffs
+++  mov ra2, unif         ; mov r0, elem_num
+ +
+-+sub rb29, rb24, ra1.16b  # Compute vdw_setup1(dst_pitch-width)
+-+add rb17, ra1.16a, 1
+-+add rb18, ra1.16a, 3
+-+shl r0,   ra1.16a, 7
+++  setf_nz_if_v                                  # Also acts as delay slot for ra2
+ +
+-+add ra_frame_base_next, rb_x_next, r2
+++  add r0, ra2.16b, r0   ; v8subs r1, r1, r1     # x ; r1=0
+++  sub r1, r1, rb_pitch  ; mov r3, unif          # r1=pitch2 mask ; r3=base
+++  max r0, r0, ra_k0     ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
+++  min r0, r0, rb_max_x  ; mov -, unif           # ; width_height
+ +
+-+# r0 is currently height<<7
+-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
+-+shl r3, r0, i_shift21     ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
+-+shr r3, r3, 8
+-+add vr_setup, r3, rb21
+++  shl rb_xshift2_next, r0, 4
+ +
+-+add r0, r0, ra1.16b    # Combine width and height of destination area
+-+shl r0, r0, i_shift16  # Shift into bits 16 upwards of the vdw_setup0 register
+-+add rb26, r0, rb27
+++  and r0, r0, -2        ; mov ra0, unif         # H filter coeffs
+++  add r0, r0, r0        ; mov ra_y2_next, ra2.16a
+++  and r1, r0, r1        ; mov ra3, unif         # ; V filter coeffs
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        ; mov rb8,  ra3.8a      # Add stripe offsets ; start unpacking filter coeffs
+++  add rb_base2_next, r3, r0
+ +
+-+# get filter coefficients
+++  mov ra1, unif         ; mov rb9,  ra3.8b      # U offset/weight
+++  mov.ifnz ra1, unif    ; mov rb10, ra3.8c      # V offset/weight
+ +
+-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+
+-+# Get offset & weight stuff
+-+
+-+# The unif read occurs unconditionally, only the write is conditional
+-+mov      ra1, unif  ; mov rb8,  ra3.8a    # U offset/weight ;
+-+mov.ifnz ra1, unif  ; mov rb9,  ra3.8b    # V offset/weight ;
+-+nop                 ; mov rb10, ra3.8c
+-+mov r3, 0           ; mov rb11, ra3.8d    # Loop counter ;
+-+
+-+shl r1, ra1.16b, rb13
+-+asr rb12, r1, 1
+++  nop                   ; mov rb11, ra3.8d
+++  shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3     # ; r3 (loop counter)  = 0
+++  asr rb12, r1, 1
+ +
+ +# ra1.16a used directly in the loop
+ +
+@@ -13882,125 +17060,147 @@ index 0000000..aa9e1e7
+ +# then submit two more texture requests
+ +
+ +# r3 = 0
+++
+ +:uvloop_b
+ +# retrieve texture results and pick out bytes
+ +# then submit two more texture requests
+ +
+-+sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1          ; ldtmu0     # loop counter increment
+-+shr r0, r4, ra_xshift     ; mov.ifz ra_x, rb_x_next       ; ldtmu1
+-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+-+mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+shr r1, r4, ra_xshift     ; v8subs r0, r0, rb20  # v8subs masks out all but bottom byte
+++  sub.setf -, r3, rb17  ; v8adds rb31, r3, ra_k1 ; ldtmu1     # loop counter increment
+++  shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next
+++  shr r1, r0, 8         ; mov.ifnz r3, ra_y2
+ +
+-+max r2, ra_y, 0  # y
+-+min r2, r2, rb_frame_height_minus_1
+-+add ra_y, ra_y, 1         ; mul24 r2, r2, r3
+-+add t0s, ra_x, r2         ; v8subs r1, r1, rb20
+-+add t1s, ra_frame_base, r2
+++  max r2, r3, ra_k0     ; mov.ifz ra_base2, rb_base2_next
+++  min r2, r2, rb_max_y                              # r2 = y2 (r3) clamped to the range up to rb_max_y
+++  add ra_y2, r3, ra_k1  ; mul24 r2, r2, rb_pitch    # ; r2 = clamped y2 * pitch
+++  add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255  # v8min masks out all but bottom byte
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+ +
+ +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+ +
+-+nop                  ; mul24      r3, ra0.8a,       r0
+-+nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8
+-+nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1
+-+nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9
+-+sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2
+-+nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3
+-+nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+sub r0, r2, r3       ; mov r3, rb31
+-+sub.setf -, r3, 4    ; mov ra12, ra13
+-+brr.anyn -, r:uvloop_b
+-+mov ra13, ra14          ; mul24 r1, ra14, rb9
+-+mov ra14, ra15
+-+mov ra15, r0            ; mul24 r0, ra12, rb8
+++  and r1, r1, rb_k255  ; mul24      r3, ra0.8a,       r0
+++  nop                  ; mul24      r2, ra0.8b << 1,  r0 << 1     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8,  r1 << 8     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9,  r1 << 9     @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2,  r0 << 2     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10    @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra0.8d << 3,  r0 << 3     @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11    @ "mul_used", 0
+++  sub r0, r2, r3       ; mov r3, rb31
+++  sub.setf -, r3, 4    ; mov ra12, ra13
+++  brr.anyn -, r:uvloop_b
+++  mov ra13, ra14          ; mul24 r1, ra14, rb9
+++  mov ra14, ra15          ; mul24 r2, ra15, rb10
+++  mov ra15, r0            ; mul24 r0, ra12, rb8
+ +# >>> .anyn uvloop_b
+ +
+ +# apply vertical filter and write to VPM
+ +
+-+sub r1, r1, r0          ; mul24 r0, ra14, rb10
+-+add r1, r1, r0          ; mul24 r0, ra15, rb11
+-+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
+-+sub r1, r1, r0          ; mul24 r0, vpm, ra4  # ra4 = 0x10000
+-+sub.setf -, r3, rb18    ; mul24 r1, r1, ra_k256
+-+asr r1, r1, 14          # shift2=6
+-+
+-+asr r0, r0, i_shift16   ; mul24 r1, r1, ra1.16a
+-+nop                     ; mul24 r0, r0, rb14
+-+
+-+add r1, r1, r0          ; mov -, vw_wait
+-+shl r1, r1, 8           # Lose bad top 8 bits & sign extend
+++  sub r1, r1, r0        ; mov ra8.16b, ra7      # FIFO rotate (all ra/b4..7)
+++  add r1, r1, r2        ; mul24 r0, ra15, rb11
+++  sub r1, r1, r0        ; mul24 r0, ra7.16b, rb14
+++  mov ra7, rb6          ; mul24 r1, r1, ra_k256
+++  asr r1, r1, 14        ; mov rb6, ra5 # shift2=6
+ +
+-+add r1, r1, rb12        # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+++  mov ra5, rb4          ; mul24 r1, r1, ra1.16a
+++  add r1, r1, r0        ; mov rb4, ra4
+ +
+-+brr.anyn -, r:uvloop_b
+-+asr r1, r1, rb13         # Delay 1
+-+min r1, r1, rb_k255       # Delay 2
+-+max vpm, r1, 0         # Delay 3
+++  mov ra4, rb5          ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
+++  add r1, r1, rb12      ; mov rb5, ra6          # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+ +
+++  sub.setf -, r3, ra31  ; mov ra6, rb7
+++  asr ra3.8as, r1, rb13
+++  nop                   ; mov r1, r1 << 8
+++  brr.anyn -, r:uvloop_b
+++  asr ra3.8bs, r1, rb13
+++  mov -, vw_wait        ; mov rb7, ra8          #  vw_wait is B-reg (annoyingly) ; Final FIFO mov
+++  mov vpm, ra3
+++# >>>
+ +
+-+# DMA out for U
+-+
+-+mov vw_setup, rb26 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+++# DMA out
+ +
+-+# DMA out for V
+-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
+-+# Could potentially push this write into the start of the next pipeline stage.
+-+mov r0, 16
+-+mov -, vw_wait
+++  bra -, ra_link
+++  mov vw_setup, rb26
+++  mov vw_setup, rb29
+++  mov vw_addr, unif     # c_dst_addr
+ +
+-+bra -, ra31
+-+add vw_setup, rb26, r0 # VDW setup 0
+-+mov vw_setup, rb29 # Stride
+-+mov vw_addr, unif # start the VDW
+ +
+ +################################################################################
+ +
+ +# mc_exit()
+ +
+-+::mc_exit
+-+mov  -, vw_wait # wait on the VDW
+++::mc_interrupt_exit8c
+++  ldtmu0
+++  ldtmu1
+++  ldtmu1
+++  mov  -, vw_wait ; nop ; ldtmu0  # wait on the VDW
+++
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++#  mov -,sacq(0) # 8
+++#  mov -,sacq(0) # 9
+++#  mov -,sacq(0) # 10
+++#  mov -,sacq(0) # 11
+ +
+-+mov -,srel(0)
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+-+ldtmu0
+-+ldtmu1
+-+ldtmu0
+-+ldtmu1
+++# Chroma & Luma the same now
+++::mc_exit_c
+++::mc_exit
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  mov  -, vw_wait ; nop ; ldtmu1 # wait on the VDW
+ +
+-+nop        ; nop ; thrend
+-+nop        ; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++  mov -,srel(0)
+ +
+-+# mc_interrupt_exit8()
+-+::mc_interrupt_exit8
+-+mov  -, vw_wait # wait on the VDW
+++  nop        ; nop ; thrend
+++  nop        ; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+-+ldtmu0
+-+ldtmu1
+-+ldtmu0
+-+ldtmu1
+ +
+-+mov -,sacq(0) # 1
+-+mov -,sacq(0) # 2
+-+mov -,sacq(0) # 3
+-+mov -,sacq(0) # 4
+-+mov -,sacq(0) # 5
+-+mov -,sacq(0) # 6
+-+mov -,sacq(0) # 7
+++# mc_interrupt_exit12()
+++::mc_interrupt_exit12
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  mov  -, vw_wait ; nop ; ldtmu1  # wait on the VDW
+ +
+-+nop        ; nop ; thrend
+-+mov interrupt, 1; nop # delay slot 1
+-+nop        ; nop # delay slot 2
+++  mov -,sacq(0) # 1
+++  mov -,sacq(0) # 2
+++  mov -,sacq(0) # 3
+++  mov -,sacq(0) # 4
+++  mov -,sacq(0) # 5
+++  mov -,sacq(0) # 6
+++  mov -,sacq(0) # 7
+++  mov -,sacq(0) # 8
+++  mov -,sacq(0) # 9
+++  mov -,sacq(0) # 10
+++  mov -,sacq(0) # 11
+ +
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+ +
+++::mc_exit1
+++  mov  -, vw_wait # wait on the VDW
+ +
+++  ldtmu0
+++  ldtmu1
+++  ldtmu0
+++  ldtmu1
+++  nop        ; nop ; thrend
+++  mov interrupt, 1; nop # delay slot 1
+++  nop        ; nop # delay slot 2
+ +
+ +# LUMA CODE
+ +
+@@ -14010,116 +17210,104 @@ index 0000000..aa9e1e7
+ +################################################################################
+ +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+ +::mc_setup
+-+  mov r3, 16
+-+
+ +  # Need to save these because we need to know the frame dimensions before computing texture coordinates
+-+  mov ra8, unif  # y_x
+-+  mov ra9, unif  # ref_y_base
+-+  mov ra10, unif # y2_x2
+-+  mov ra11, unif # ref_y2_base
+++  mov tmurs, 1          ; mov ra8, unif         # No TMU swap ; y_x
+++  mov ra9, unif         # ref_y_base
+++  mov ra10, unif        # y2_x2
+++  mov ra11, unif        # ref_y2_base
+ +
+ +# Read image dimensions
+-+  mov r1, unif # width_height
+-+  shl r0,r1,r3
+-+  asr r1,r1,r3 # width
+-+  asr r0,r0,r3 # height
+-+  sub rb_frame_width_minus_1,r1,1
+-+  sub rb_frame_height_minus_1,r0,1
+-+
+-+# get source pitch
+-+  mov rb_pitch, unif # src_pitch
+++  mov ra3, unif         # width_height
+++  mov rb_xpitch, unif   # stride2
+++  sub rb_max_x, ra3.16b, 1
+++  sub rb_max_y, ra3.16a, 1
+++  mov rb_pitch, unif    # stride1
+ +
+ +# get destination pitch
+-+  mov r0, unif       # dst_pitch
+ +  mov r1, vdw_setup_1(0)
+-+  add rb24, r1, r0
+++  or  rb24, r1, rb_pitch
+ +
+ +# Compute base address for first and second access
+-+  mov r1, ra8 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+++  mov r3, elem_num
+++  add r0, ra8.16a, r3   # Load x + elem_num
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9  # Load the frame base
+++  min r0, r0, rb_max_x
+ +  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  add ra_y, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame0 (not including y offset)
+++
+++
+++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs
+++
+++  and r0, r0, -4        ; v8subs r2, r2, r2
+++  sub r2, r2, rb_pitch
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base, ra9, r0
+++
+++  mov r1, ra8.16b       # Load y
+++  add ra_y, r1, 1       # Set for next
+ +  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t0s, r2, r1 ; mov ra_frame_base, r2
+-+
+-+  mov r1, ra10 # y_x
+-+  shl r0,r1,r3 # r0 is x<<16
+-+  asr r1,r1,r3 # r1 is y
+-+  asr r0,r0,r3 # r0 is x
+-+  add r0, r0, elem_num # Load x
+++  min r1, r1, rb_max_y
+++
+++# submit texture requests for first line
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t0s, ra_base, r1
+++
+++
+++  # r3 still contains elem_num
+++  add r0, ra10.16a, r3  # Load x
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3 # Compute shifts
+-+  add ra_y2, r1, 1
+-+  and r0, r0, ~3  # r0 gives the clipped and aligned x coordinate
+-+  add r2, r2, r0  # r2 is address for frame1 (not including y offset)
+++  min r0, r0, rb_max_x
+++  shl rb_xshift2_next, r0, 3 # Compute shifts
+++
+++  # r2 still contains mask
+++  and r0, r0, -4
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base2, ra11, r0
+++
+++  mov r1, ra10.16b       # Load y
+++  add ra_y2, r1, 1       # Set for next
+ +  max r1, r1, 0
+-+  min r1, r1, rb_frame_height_minus_1
+-+  nop             ; mul24 r1, r1, rb_pitch   # r2 contains the addresses (not including y offset) for frame0
+-+  add t1s, r2, r1 ; mov ra_frame_base2, r2
+++  min r1, r1, rb_max_y
+ +
+++# submit texture requests for first line
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t1s, ra_base2, r1
+ +
+ +# load constants
+ +
+ +  mov ra_k1, 1
+ +  mov ra_k256, 256
+-+  mov ra30, 64
+-+
+-+  mov rb20, 0xffffff00
+ +  mov rb_k255, 255
+-+  mov rb23, 24
+++  mov ra_k0, 0
+ +
+ +# touch vertical context to keep simulator happy
+ +
+-+  mov ra8, 0
+-+  mov ra9, 0
+-+  mov ra10, 0
+-+  mov ra11, 0
+-+  mov ra12, 0
+-+  mov ra13, 0
+-+  mov ra14, 0
+-+  mov ra15, 0
+++  mov ra8,  0           ; mov rb8,  0
+++  mov ra9,  0           ; mov rb9,  0
+++  mov ra10, 0           ; mov rb10, 0
+++  mov ra11, 0           ; mov rb11, 0
+ +
+ +# Compute part of VPM to use
+-+  mov r2, qpu_num
+-+  mov r1, r2
+-+  asr r1, r1, 2
+-+  shl r1, r1, 6
+-+  mov r0, r2
+-+  and r0, r0, 3
+-+  add r0, r0, r1
+-+  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
+-+  add rb28, r0, r1  # VPM for saving data
+-+  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
+-+  shl r0, r0, 5
+-+  add rb27, r0, r1  # Command for dma output
+++  m_calc_dma_regs rb28, rb27
+ +
+ +# Weighted prediction denom
+-+  add rb13, unif, 9  # unif = weight denom + 6
+-+
+-+  mov -, unif # Unused
+++  add rb13, unif, 9     # unif = weight denom + 6
+ +
+ +# submit texture requests for second line
+ +  max r1, ra_y, 0
+-+  min r1, r1, rb_frame_height_minus_1
+++  min r1, r1, rb_max_y
+ +  add ra_y, ra_y, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t0s, r1, ra_frame_base
+++  mov -, unif           ; mul24 r1, r1, rb_pitch  # unused ;
+++  add t0s, r1, ra_base
+ +
+ +  max r1, ra_y2, 0
+-+  min r1, r1, rb_frame_height_minus_1
+++  min r1, r1, rb_max_y
+ +  add ra_y2, ra_y2, 1
+-+  nop ; mul24 r1, r1, rb_pitch
+-+  add t1s, r1, ra_frame_base2
+++  nop                   ; mul24 r1, r1, rb_pitch
+++  add t1s, r1, ra_base2
+ +
+ +# FALL THROUGHT TO PER-BLOCK SETUP
+ +
+@@ -14127,47 +17315,63 @@ index 0000000..aa9e1e7
+ +# P and B blocks share the same setup code to save on Icache space
+ +:per_block_setup
+ +  mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+-+  mov ra31, unif
+++  mov ra_link, unif
+++#### We do all the setup even if we are about to exit - reading junk from unif....
+ +
+-+  mov ra1, unif  ; mov r1, elem_num  # y_x ; elem_num has implicit unpack??
+++  mov ra1, unif         ; mov r3, elem_num  # y_x ; elem_num has implicit unpack??
+ +
+ +# per-channel shifts were calculated on the *previous* invocation
+ +  mov ra_xshift, ra_xshift_next
+-+  mov rx_xshift2, rx_xshift2_next
+++  mov rb_xshift2, rb_xshift2_next
+ +
+ +# get base addresses and per-channel shifts for *next* invocation
+ +
+-+  add r0, ra1.16a, r1 # Load x
+++  add r0, ra1.16a, r3   # Load x
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl ra_xshift_next, r0, 3 # Compute shifts
+-+  mov r3, 8                          ; mov ra_y_next, ra1.16b
+-+  and r0, r0, ~3                     ; mov ra1, unif # y2_x2
+-+  add ra_frame_base_next, r2, r0
+-+
+-+  add r0, ra1.16a, r1 # Load x
+++  min r0, r0, rb_max_x
+++
+++  shl ra_xshift_next, r0, 3         # Compute shifts
+++  and r0, r0, -4        ; v8subs r2, r2, r2
+++  sub r2, r2, rb_pitch
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add ra_base_next, unif, r0              # Base1
+++  mov ra_y_next, ra1.16b                      # Load y
+++  mov ra1, unif         # y2_x2
+++  nop                   # ra1 delay
+++
+++  add r0, ra1.16a, r3   # Load x2
+ +  max r0, r0, 0
+-+  min r0, r0, rb_frame_width_minus_1 ; mov r2, unif  # Load the frame base
+-+  shl rx_xshift2_next, r0, 3         # Compute shifts
+-+  add r3, r3, r3                     ; mov ra_y2_next, ra1.16b  # r3 = 16 ;
+-+  and r0, r0, ~3                     ; mov ra1, unif  # width_height ; r0 gives the clipped and aligned x coordinate
+-+  add rx_frame_base2_next, r2, r0    # r2 is address for frame1 (not including y offset)
+++  min r0, r0, rb_max_x
+++
+++  shl rb_xshift2_next, r0, 3         # Compute shifts
+++  and r0, r0, -4
+++  and r1, r0, r2
+++  xor r0, r0, r1        ; mul24 r1, r1, rb_xpitch
+++  add r0, r0, r1        # Add stripe offsets
+++  add rb_base2_next, unif, r0              # Base2
+++  mov ra_y2_next, ra1.16b                      # Load y2
+++  mov ra_width_height, unif         # width_height
+ +
+ +# set up VPM write
+-+  mov vw_setup, rb28
+++  mov vw_setup, rb28    # [ra1 delay]
+ +
+ +# get width,height of block (unif load above)
+-+  sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+-+  add rb17, ra1.16a, 5
+-+  add rb18, ra1.16a, 7
+-+  shl r0,   ra1.16a, 7
+-+  add r0,   r0, ra1.16b # Combine width and height of destination area
+-+  shl r0,   r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+++  sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width)
+++  add rb17, ra_height, 5  ; mov r0, ra_height
+++  mov r1, 16
+++  min r0, r0, r1
+++  add rb18, r0, 7
+++  shl r0,   r0, 7
+++  add r0,   r0, ra_width                        # Combine width and height of destination area
+++  shl r0,   r0, i_shift16                       # Shift into bits 16 upwards of the vdw_setup0 register
+ +  add rb26, r0, rb27                 ; mov r0, unif   # Packed filter offsets
+ +
+ +# get filter coefficients and discard unused B frame values
+-+  shl.ifz r0, r0, i_shift16      # Pick half to use
+-+  shl ra8, r0, 3
+++  shl.ifz r0, r0, i_shift16          ; mov ra5, unif    #  Pick half to use ; L0 offset/weight
+++  mov r2, 0x01040400                 # [ra5 delay]
+++  shl ra8, r0, 3                     ; mov rb14, ra5.16a
+ +
+ +# Pack the 1st 4 filter coefs for H & V tightly
+ +
+@@ -14175,9 +17379,8 @@ index 0000000..aa9e1e7
+ +  ror ra2.8a, r1, ra8.8d
+ +  ror ra0.8a, r1, ra8.8c
+ +
+-+  mov r1,0x01040400
+-+  ror ra2.8b, r1, ra8.8d
+-+  ror ra0.8b, r1, ra8.8c
+++  ror ra2.8b, r2, ra8.8d
+++  ror ra0.8b, r2, ra8.8c
+ +
+ +  mov r1,0x050b0a00  # -ve
+ +  ror ra2.8c, r1, ra8.8d
+@@ -14203,37 +17406,44 @@ index 0000000..aa9e1e7
+ +  ror ra3.8c, r1, ra8.8d
+ +  ror ra1.8c, r1, ra8.8c
+ +
+++  mov r1,0x01010000  # -ve
+++  ror ra3.8d, r1, ra8.8d
+++  ror ra1.8d, r1, ra8.8c
+++
+ +# Extract weighted prediction information in parallel
+++# We are annoyingly A src limited here
+ +
+-+  mov r1,0x01010000  # -ve
+-+  ror ra3.8d, r1, ra8.8d    ; mov r0, unif      # ; weight L1 weight L1 (hi16)/weight L0 (lo16)
+-+  ror ra1.8d, r1, ra8.8c    ; mov r1, rb13      # ; rb13 = weight denom + 6 + 9
+-+
+-+# r3 = 16 from (long way) above
+-+  shl r1, unif, r1          ; mov rb4, ra3.8a   # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
+-+  asr ra18, r0, r3          ; mov rb5, ra3.8b
+-+  bra -, ra31
+-+  shl r0, r0, r3            ; mov rb6, ra3.8c
+-+  mov r3, 0                 ; mov rb7, ra3.8d   # loop count ;
+-+  asr rb12, r1, 9
+-+
+-+# >>> branch ra31
+++  mov rb4, ra3.8a            ; mov ra18, unif
+++  mov rb5, ra3.8b
+++  mov rb6, ra3.8c
+++  mov.ifnz ra5, ra18
+++
+++  mov rb_dest, unif     # Destination address
+++
+++  bra -, ra_link
+++
+++  shl r0, ra5.16b, rb13      # Offset calc
+++  asr rb12, r0, 9            # For B l1 & L0 offsets should be identical so it doesn't matter which we use
+++  mov r3, 0                  ; mov rb7, ra3.8d
+++# >>> branch ra_link
+ +#
+ +# r3 = 0
+-+# ra18 = weight L1
+-+# r0   = weight L0 << 16 (will be put into rb14 in filter preamble)
+-+# rb13 = weight denom + 6 + 9
+-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++# ra18.16a = weight L1
+++# ra5.16a  = weight L0/L1 depending on side (wanted for 2x mono-pred)
+++# rb12     = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
+++# rb13     = weight denom + 6 + 9
+++# rb14     = weight L0
+ +
+ +
+ +################################################################################
+-+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ +# In a P block, y2_x2 should be y_x+8
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +
+ +::mc_filter
+-+# r0 = weight << 16; We want weight * 2 in rb14
+-+  asr rb14, r0, 15
+++# ra5.16a = weight << 16; We want weight * 2 in rb14
+++
+++  shl rb14, ra5.16a, 1
+ +
+ +# r3 = 0
+ +
+@@ -14249,20 +17459,20 @@ index 0000000..aa9e1e7
+ +# might be B where y != y2 so we must do full processing on both y and y2
+ +
+ +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
+ +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+ +  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8min with 255 masks out all but bottom byte
+ +
+ +  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+@@ -14271,21 +17481,21 @@ index 0000000..aa9e1e7
+ +
+ +# apply horizontal filter
+ +  nop                  ; mul24      r3, ra0.8a,      r0
+-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+ +  sub r0, r2, r3       ; mov r3, rb31
+ +
+ +  sub.setf -, r3, 8       ; mov r1,   ra8
+@@ -14324,18 +17534,48 @@ index 0000000..aa9e1e7
+ +  max vpm, r1, 0         # Delay 3
+ +# >>> branch.anyn yloop
+ +
+-+# DMA out
+++# If looping again then we consumed 16 height last loop
+++  # rb29 (stride) remains constant
+++  # rb17 remains const (based on total height)
+++  # recalc rb26, rb18 based on new segment height
+++  # N.B. r3 is loop counter still
+++
+++  mov r1, 16
+++  sub r0, ra_height, r1
+++  mov ra_height, r0
+++  max.setf r0, r0, 0    # Done if Z now
+ +
+-+  brr -, r:per_block_setup
+++# DMA out
+++  brr.anyz -, r:per_block_setup
+ +  mov vw_setup, rb26 # VDW setup 0    Delay 1
+ +  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+++  mov vw_addr, rb_dest # start the VDW   Delay 3
+++# >>> .anyz per_block_setup
+++
+++  min r0, r0, r1
+++  add rb18, rb18, r0
+++  sub r0, r0, r1
+++  shl r0, r0, i_shift23
+++  add rb26, rb26, r0
+++
+++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
+++  add rb_dest, rb_dest, r0
+++
+++  mov vw_setup, rb28    # Reset our VPM write pointer
+++
+++  brr -, r:yloop
+++  nop
+++  nop
+++  nop
+++# >>>
+++
+++
+ +
+ +
+ +
+ +################################################################################
+ +
+-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+ +# In a P block, only the first half of coefficients contain used information.
+ +# At this point we have already issued two pairs of texture requests for the current block
+ +# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+@@ -14347,7 +17587,7 @@ index 0000000..aa9e1e7
+ +
+ +::mc_filter_b
+ +  # r0 = weightL0 << 16, we want it in rb14
+-+  asr rb14, r0, i_shift16
+++#  asr rb14, r0, i_shift16
+ +
+ +:yloopb
+ +# retrieve texture results and pick out bytes
+@@ -14357,20 +17597,20 @@ index 0000000..aa9e1e7
+ +# Perhaps we could add on the pitch and clip using larger values?
+ +
+ +  sub.setf -, r3, rb17      ; v8adds r3, r3, ra_k1                           ; ldtmu0
+-+  shr r0, r4, ra_xshift     ; mov.ifz ra_frame_base2, rx_frame_base2_next    ; ldtmu1
+-+  mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+++  shr r0, r4, ra_xshift     ; mov.ifz ra_base2, rb_base2_next    ; ldtmu1
+++  mov.ifz ra_base, ra_base_next ; mov rb31, r3
+ +  mov.ifz ra_y, ra_y_next   ; mov r3, rb_pitch
+-+  shr r1, r4, rx_xshift2    ; mov.ifz ra_y2, ra_y2_next
+++  shr r1, r4, rb_xshift2    ; mov.ifz ra_y2, ra_y2_next
+ +
+ +  max r2, ra_y, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y, ra_y, 1            ; mul24 r2, r2, r3
+-+  add t0s, ra_frame_base, r2   ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
+++  add t0s, ra_base, r2   ; v8min r0, r0, rb_k255 # v8min with 255 masks out all but bottom byte
+ +
+ +  max r2, ra_y2, 0  # y
+-+  min r2, r2, rb_frame_height_minus_1
+++  min r2, r2, rb_max_y
+ +  add ra_y2, ra_y2, 1          ; mul24 r2, r2, r3
+-+  add t1s, ra_frame_base2, r2  ; v8subs r1, r1, rb20
+++  add t1s, ra_base2, r2  ; v8min r1, r1, rb_k255
+ +
+ +# generate seven shifted versions
+ +# interleave with scroll of vertical context
+@@ -14379,21 +17619,21 @@ index 0000000..aa9e1e7
+ +
+ +# apply horizontal filter
+ +  nop                  ; mul24      r3, ra0.8a,      r0
+-+  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
+-+  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1
+-+  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2
+-+  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
+-+  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3
+-+  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
+-+  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4
+-+  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
+-+  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5
+-+  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
+-+  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6
+-+  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
+-+  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7
+-+  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
+++  nop                  ; mul24.ifnz r3, ra0.8a << 8, r1 << 8    @ "mul_used", 0
+++  nop                  ; mul24      r2, ra0.8b << 1, r0 << 1    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r2, ra0.8b << 9, r1 << 9    @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8c << 2, r0 << 2    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8c << 10, r1 << 10  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra0.8d << 3, r0 << 3    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra0.8d << 11, r1 << 11  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8a << 4, r0 << 4    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8a << 12, r1 << 12  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8b << 5, r0 << 5    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8b << 13, r1 << 13  @ "mul_used", 0
+++  sub r2, r2, r3       ; mul24      r3, ra1.8c << 6, r0 << 6    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8c << 14, r1 << 14  @ "mul_used", 0
+++  add r2, r2, r3       ; mul24      r3, ra1.8d << 7, r0 << 7    @ "mul_used", 0
+++  nop                  ; mul24.ifnz r3, ra1.8d << 15, r1 << 15  @ "mul_used", 0
+ +  sub r0, r2, r3       ; mov r3, rb31
+ +
+ +  sub.setf -, r3, 8       ; mov r1,   ra8
+@@ -14405,7 +17645,6 @@ index 0000000..aa9e1e7
+ +  # >>> .anyn yloopb
+ +
+ +  # apply vertical filter and write to VPM
+-+
+ +  nop                     ; mul24 r0, rb8,  ra2.8a
+ +  nop                     ; mul24 r1, rb9,  ra2.8b
+ +  sub r1, r1, r0          ; mul24 r0, rb10, ra2.8c
+@@ -14421,558 +17660,174 @@ index 0000000..aa9e1e7
+ +
+ +  asr r1, r1, 14
+ +  nop                     ; mul24 r0, r1, rb14
+-+  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18 << 8
+-+
+-+  add r1, r1, r0          ; mov -, vw_wait
+-+  shl r1, r1, 8
+-+
+-+  brr.anyn -, r:yloopb
+-+  asr r1, r1, rb13         # Delay 1
+-+  min r1, r1, rb_k255       # Delay 2
+-+  max vpm, r1, 0         # Delay 3
+-+
+-+# DMA out
+-+  brr -, r:per_block_setup
+-+  mov vw_setup, rb26 # VDW setup 0    Delay 1
+-+  mov vw_setup, rb29 # Stride         Delay 2
+-+  mov vw_addr, unif # start the VDW   Delay 3
+-+
+-+################################################################################
+-+
+-+# mc_interrupt_exit12()
+-+::mc_interrupt_exit12
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  # Dummy wait to test instructions
+-+#  mov r3,1000000
+-+#:dummy_loop
+-+#  sub.setf r3, r3, 1
+-+#  nop
+-+#  nop
+-+#  brr.anynn -, r:dummy_loop
+-+#  nop
+-+#  nop
+-+#  nop
+-+
+-+  ldtmu0
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu1
+-+
+-+  mov -,sacq(0) # 1
+-+  mov -,sacq(0) # 2
+-+  mov -,sacq(0) # 3
+-+  mov -,sacq(0) # 4
+-+  mov -,sacq(0) # 5
+-+  mov -,sacq(0) # 6
+-+  mov -,sacq(0) # 7
+-+  mov -,sacq(0) # 8
+-+  mov -,sacq(0) # 9
+-+  mov -,sacq(0) # 10
+-+  mov -,sacq(0) # 11
+-+
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+-+
+-+
+-+::mc_exit1
+-+  mov  -, vw_wait # wait on the VDW
+-+
+-+  ldtmu0
+-+  ldtmu1
+-+  ldtmu0
+-+  ldtmu1
+-+  nop        ; nop ; thrend
+-+  mov interrupt, 1; nop # delay slot 1
+-+  nop        ; nop # delay slot 2
+++  add r0, r0, r2          ; mul24 r1, r1 << 8, ra18.16a << 8    @ "mul_used", 0
+ +
+++  add r1, r1, r0          ; mov -, vw_wait
+++  shl r1, r1, 8
+ +
+-+::mc_end
+-+# Do not add code here because mc_end must appear after all other code.
+-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+-new file mode 100644
+-index 0000000..db41a4d
+---- /dev/null
+-+++ b/libavcodec/rpi_user_vcsm.h
+-@@ -0,0 +1,459 @@
+-+/*****************************************************************************
+-+* Copyright 2001 - 2011 Broadcom Corporation.  All rights reserved.
+-+*
+-+* This program is the proprietary software of Broadcom Corporation and/or
+-+* its licensors, and may only be used, duplicated, modified or distributed
+-+* pursuant to the terms and conditions of a separate, written license
+-+* agreement executed between you and Broadcom (an "Authorized License").
+-+* Except as set forth in an Authorized License, Broadcom grants no license
+-+* (express or implied), right to use, or waiver of any kind with respect to
+-+* the Software, and Broadcom expressly reserves all rights in and to the
+-+* Software and all intellectual property rights therein.  IF YOU HAVE NO
+-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
+-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
+-+* THE SOFTWARE.
+-+*
+-+* Except as expressly set forth in the Authorized License,
+-+* 1. This program, including its structure, sequence and organization,
+-+*    constitutes the valuable trade secrets of Broadcom, and you shall use
+-+*    all reasonable efforts to protect the confidentiality thereof, and to
+-+*    use this information only in connection with your use of Broadcom
+-+*    integrated circuit products.
+-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
+-+*    AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
+-+*    WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
+-+*    RESPECT TO THE SOFTWARE.  BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
+-+*    IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
+-+*    FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
+-+*    QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
+-+*    ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
+-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
+-+*    LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
+-+*    OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
+-+*    YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
+-+*    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
+-+*    OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
+-+*    IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
+-+*    ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
+-+*****************************************************************************/
+-+
+-+#ifndef __USER_VCSM__H__INCLUDED__
+-+#define __USER_VCSM__H__INCLUDED__
+-+
+-+/* VideoCore Shared Memory - user interface library.
+-+**
+-+** This library provides all the necessary abstraction for any application to
+-+** make use of the shared memory service which is distributed accross a kernel
+-+** driver and a videocore service.
+-+**
+-+** It is an application design decision to choose or not to use this service.
+-+**
+-+** The logical flow of operations that a user application needs to follow when
+-+** using this service is:
+-+**
+-+**       1) Initialize the service.
+-+**       2) Allocate shared memory blocks.
+-+**       3) Start using the allocated blocks.
+-+**          - In order to gain ownership on a block, lock the allocated block,
+-+**            locking a block returns a valid address that the user application
+-+**            can access.
+-+**          - When finished with using the block for the current execution cycle
+-+**            or function, and so when giving up the ownership, unlock the block.
+-+**       4) A block can be locked/unlocked as many times required - within or outside
+-+**          of - a specific execution context.
+-+**       5) To completely release an allocated block, free it.
+-+**       6) If the service is no longer required, terminate it.
+-+**
+-+**
+-+** Some generic considerations:
+-+
+-+** Allocating memory blocks.
+-+**
+-+**   Memory blocks can be allocated in different manners depending on the cache
+-+**   behavior desired.  A given block can either be:
+-+
+-+**       - Allocated in a non cached fashion all the way through host and videocore.
+-+**       - Allocated in a cached fashion on host OR videocore.
+-+**       - Allocated in a cached fashion on host AND videocore.
+-+**
+-+**   It is an application decision to determine how to allocate a block.  Evidently
+-+**   if the application will be doing substantial read/write accesses to a given block,
+-+**   it is recommended to allocate the block at least in a 'host cached' fashion for
+-+**   better results.
+-+**
+-+**
+-+** Locking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, locking the
+-+**   memory block (and so taking ownership of it) will trigger a cache invalidation.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**   It is possible to dynamically change the host cache behavior (ie cached or non
+-+**   cached) of a given allocation without needing to free and re-allocate the block.
+-+**   This feature can be useful for such application which requires access to the block
+-+**   only at certain times and not otherwise.  By changing the cache behavior dynamically
+-+**   the application can optimize performances for a given duration of use.
+-+**   Such dynamic cache behavior remapping only applies to host cache and not videocore
+-+**   cache.  If one requires to change the videocore cache behavior, then a new block
+-+**   must be created to replace the old one.
+-+**
+-+**   On successful locking, a valid pointer is returned that the application can use
+-+**   to access to data inside the block.  There is no guarantee that the pointer will
+-+**   stay valid following the unlock action corresponding to this lock.
+-+**
+-+**
+-+** Unocking memory blocks.
+-+**
+-+**   When the memory block has been allocated in a host cached fashion, unlocking the
+-+**   memory block (and so forgiving its ownership) will trigger a cache flush unless
+-+**   explicitely asked not to flush the cache for performances reasons.
+-+**
+-+**   For the above reason and when using host cached allocation, it is important that
+-+**   an application properly implements the lock/unlock mechanism to ensure cache will
+-+**   stay coherent, otherwise there is no guarantee it will at all be.
+-+**
+-+**
+-+** A complete API is defined below.
+-+*/
+++  brr.anyn -, r:yloopb
+++  asr r1, r1, rb13         # Delay 1
+++  min r1, r1, rb_k255       # Delay 2
+++  max vpm, r1, 0         # Delay 3
+ +
+-+#ifdef __cplusplus
+-+extern "C"
+-+{
+-+#endif
+ +
+-+/* Different status that can be dumped.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_STATUS_VC_WALK_ALLOC = 0,   // Walks *all* the allocation on videocore.
+-+                                    // Result of the walk is seen in the videocore
+-+                                    // log.
+-+   VCSM_STATUS_HOST_WALK_MAP,       // Walks the *full* mapping allocation on host
+-+                                    // driver (ie for all processes).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_MAP,   // Walks the per process mapping allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
+-+                                    // driver (for current process).  Result of
+-+                                    // the walk is seen in the kernel log.
+-+   VCSM_STATUS_VC_MAP_ALL,          // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
+-+                                    // VCSM_STATUS_HOST_WALK_MAP.
+-+                                    //
+-+   VCSM_STATUS_NONE,                // Must be last - invalid.
+-+
+-+} VCSM_STATUS_T;
+-+
+-+/* Different kind of cache behavior.
+-+*/
+-+typedef enum
+-+{
+-+   VCSM_CACHE_TYPE_NONE = 0,        // No caching applies.
+-+   VCSM_CACHE_TYPE_HOST,            // Allocation is cached on host (user space).
+-+   VCSM_CACHE_TYPE_VC,              // Allocation is cached on videocore.
+-+   VCSM_CACHE_TYPE_HOST_AND_VC,     // Allocation is cached on both host and videocore.
+-+
+-+} VCSM_CACHE_TYPE_T;
+-+
+-+/* Initialize the vcsm processing.
+-+**
+-+** Must be called once before attempting to do anything else.
+-+**
+-+** Returns 0 on success, -1 on error.
+-+*/
+-+int vcsm_init( void );
+++# If looping again then we consumed 16 height last loop
+++  # rb29 (stride) remains constant
+++  # rb17 remains const (based on total height)
+++  # recalc rb26, rb18 based on new segment height
+++  # N.B. r3 is loop counter still
+ +
+++  mov r1, 16
+++  sub r0, ra_height, r1
+++  mov ra_height, r0
+++  max.setf r0, r0, 0    # Done if Z now
+ +
+-+/* Terminates the vcsm processing.
+-+**
+-+** Must be called vcsm services are no longer needed, it will
+-+** take care of removing any allocation under the current process
+-+** control if deemed necessary.
+-+*/
+-+void vcsm_exit( void );
+++# DMA out
+++  brr.anyz -, r:per_block_setup
+++  mov vw_setup, rb26 # VDW setup 0    Delay 1
+++  mov vw_setup, rb29 # Stride         Delay 2
+++  mov vw_addr, rb_dest # start the VDW   Delay 3
+++# >>> .anyz per_block_setup
+ +
+++  min r0, r0, r1
+++  add rb18, rb18, r0
+++  sub r0, r0, r1
+++  shl r0, r0, i_shift23
+++  add rb26, rb26, r0
+ +
+-+/* Queries the status of the the vcsm.
+-+**
+-+** Triggers dump of various kind of information, see the
+-+** different variants specified in VCSM_STATUS_T.
+-+**
+-+** Pid is optional.
+-+*/
+-+void vcsm_status( VCSM_STATUS_T status, int pid );
+-+
+-+
+-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
+-+** allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc( unsigned int size, char *name );
+-+
+-+
+-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
+-+** allocator, the type of caching requested is passed as argument of the
+-+** function call.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
+-+
+-+
+-+/* Shares an allocated block of memory via the vcsm memory allocator.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** On success, the user must invoke vcsm_lock with the returned opaque
+-+** handle to gain access to the memory associated with the opaque handle.
+-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
+-+** function definition for more details on the one that can be used).
+-+**
+-+** A well behaved application should make every attempt to lock/unlock
+-+** only for the duration it needs to access the memory data associated with
+-+** the opaque handle.
+-+*/
+-+unsigned int vcsm_malloc_share( unsigned int handle );
+-+
+-+
+-+/* Resizes a block of memory allocated previously by vcsm_alloc.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** The handle must be unlocked by user prior to attempting any
+-+** resize action.
+-+**
+-+** On error, the original size allocated against the handle
+-+** remains available the same way it would be following a
+-+** successful vcsm_malloc.
+-+*/
+-+int vcsm_resize( unsigned int handle, unsigned int new_size );
+-+
+-+
+-+/* Frees a block of memory that was successfully allocated by
+-+** a prior call the vcms_alloc.
+-+**
+-+** The handle should be considered invalid upon return from this
+-+** call.
+-+**
+-+** Whether any memory is actually freed up or not as the result of
+-+** this call will depends on many factors, if all goes well it will
+-+** be freed.  If something goes wrong, the memory will likely end up
+-+** being freed up as part of the vcsm_exit process.  In the end the
+-+** memory is guaranteed to be freed one way or another.
+-+*/
+-+void vcsm_free( unsigned int handle );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a mapped user address
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
+-+
+-+
+-+/* Retrieves a videocore opaque handle from a opaque handle
+-+** pointer.  The videocore handle will correspond to the actual
+-+** memory mapped in videocore.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+**
+-+** Note: the videocore opaque handle is distinct from the user
+-+**       opaque handle (allocated via vcsm_malloc) and it is only
+-+**       significant for such application which knows what to do
+-+**       with it, for the others it is just a number with little
+-+**       use since nothing can be done with it (in particular
+-+**       for safety reason it cannot be used to map anything).
+-+*/
+-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
+++  nop ; mul24 r0, r1, rb_pitch  # r0 = pitch*16
+++  add rb_dest, rb_dest, r0
+ +
+++  mov vw_setup, rb28    # Reset our VDM write pointer
+ +
+-+/* Retrieves a user opaque handle from a mapped user address
+-+** pointer.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero opaque handle on success.
+-+*/
+-+unsigned int vcsm_usr_handle( void *usr_ptr );
+-+
+-+
+-+/* Retrieves a mapped user address from an opaque user
+-+** handle.
+-+**
+-+** Returns:        0 on error
+-+**                 a non-zero address on success.
+-+**
+-+** On success, the address corresponds to the pointer
+-+** which can access the data allocated via the vcsm_malloc
+-+** call.
+-+*/
+-+void *vcsm_usr_address( unsigned int handle );
+-+
+-+
+-+/* Locks the memory associated with this opaque handle.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock( unsigned int handle );
+-+
+-+
+-+/* Locks the memory associated with this opaque handle.  The lock
+-+** also gives a chance to update the *host* cache behavior of the
+-+** allocated buffer if so desired.  The *videocore* cache behavior
+-+** of the allocated buffer cannot be changed by this call and such
+-+** attempt will be ignored.
+-+**
+-+** The system will attempt to honour the cache_update mode request,
+-+** the cache_result mode will provide the final answer on which cache
+-+** mode is really in use.  Failing to change the cache mode will not
+-+** result in a failure to lock the buffer as it is an application
+-+** decision to choose what to do if (cache_result != cache_update)
+-+**
+-+** The value returned in cache_result can only be considered valid if
+-+** the returned pointer is non NULL.  The cache_result pointer may be
+-+** NULL if the application does not care about the actual outcome of
+-+** its action with regards to the cache behavior change.
+-+**
+-+** Returns:        NULL on error
+-+**                 a valid pointer on success.
+-+**
+-+** A user MUST lock the handle received from vcsm_malloc
+-+** in order to be able to use the memory associated with it.
+-+**
+-+** On success, the pointer returned is only valid within
+-+** the lock content (ie until a corresponding vcsm_unlock_xx
+-+** is invoked).
+-+*/
+-+void *vcsm_lock_cache( unsigned int handle,
+-+                       VCSM_CACHE_TYPE_T cache_update,
+-+                       VCSM_CACHE_TYPE_T *cache_result );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr( void *usr_ptr );
+-+
+-+
+-+/* Unlocks the memory associated with this user mapped address.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking a mapped address, the user should no longer
+-+** attempt to reference it.
+-+*/
+-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
+++  brr -, r:yloopb
+++  nop
+++  nop
+++  nop
+ +
+++################################################################################
+ +
+-+/* Unlocks the memory associated with this user opaque handle.
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl( unsigned int handle );
+-+
+-+
+-+/* Unlocks the memory associated with this user opaque handle.
+-+** Apply special processing that would override the otherwise
+-+** default behavior.
+-+**
+-+** If 'cache_no_flush' is specified:
+-+**    Do not flush cache as the result of the unlock (if cache
+-+**    flush was otherwise applicable in this case).
+-+**
+-+** Returns:        0 on success
+-+**                 -errno on error.
+-+**
+-+** After unlocking an opaque handle, the user should no longer
+-+** attempt to reference the mapped addressed once associated
+-+** with it.
+-+*/
+-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
+-+
+-+/* Clean and/or invalidate the memory associated with this user opaque handle
+-+**
+-+** Returns:        non-zero on error
+-+**
+-+** structure contains a list of flush/invalidate commands. Commands are:
+-+** 0: nop
+-+** 1: invalidate       given virtual range in L1/L2
+-+** 2: clean            given virtual range in L1/L2
+-+** 3: clean+invalidate given virtual range in L1/L2
+-+** 4: flush all L1/L2
+-+*/
+-+struct vcsm_user_clean_invalid_s {
+-+   struct {
+-+      unsigned int cmd;
+-+      unsigned int handle;
+-+      unsigned int addr;
+-+      unsigned int size;
+-+   } s[8];
+-+};
+++::mc_end
+++# Do not add code here because mc_end must appear after all other code.
++diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
++new file mode 100644
++index 0000000..27cbb59
++--- /dev/null
+++++ b/libavcodec/rpi_shader_cmd.h
++@@ -0,0 +1,88 @@
+++#ifndef RPI_SHADER_CMD_H
+++#define RPI_SHADER_CMD_H
+++
+++#pragma pack(push, 4)
+++
+++typedef struct qpu_mc_pred_c_s {
+++    uint32_t next_fn;
+++    int16_t next_src_y;
+++    int16_t next_src_x;
+++    uint32_t next_src_base_c;
+++    union {
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t wo_u;
+++            uint32_t wo_v;
+++            uint32_t dst_addr_c;
+++        } p;
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t weight_u;
+++            uint32_t weight_v;
+++            uint32_t dummy0;
+++        } b0;
+++        struct {
+++            uint32_t dummy0;
+++            uint32_t coeffs_x;
+++            uint32_t coeffs_y;
+++            uint32_t wo_u;
+++            uint32_t wo_v;
+++            uint32_t dst_addr_c;
+++        } b1;
+++        struct {
+++            uint32_t pic_cw;            // C Width (== Y width / 2)
+++            uint32_t pic_ch;            // C Height (== Y Height / 2)
+++            uint32_t stride2;
+++            uint32_t stride1;
+++            uint32_t wdenom;
+++            uint32_t dummy0;
+++        } s0;
+++        struct {
+++            uint32_t dummy0;
+++            uint32_t dummy1;
+++            uint32_t dummy2;
+++            uint32_t dummy3;
+++            uint32_t dummy4;
+++            uint32_t dummy5;
+++        } s1;
+++    };
+++} qpu_mc_pred_c_t;
+++
+++typedef struct qpu_mc_pred_y_s {
+++    int16_t next_src1_x;
+++    int16_t next_src1_y;
+++    uint32_t next_src1_base;
+++    int16_t next_src2_x;
+++    int16_t next_src2_y;
+++    uint32_t next_src2_base;
+++    union {
+++        struct {
+++            uint16_t h;
+++            uint16_t w;
+++            uint32_t mymx21;
+++            uint32_t wo1;
+++            uint32_t wo2;
+++            uint32_t dst_addr;
+++        } p;
+++        struct {
+++            uint16_t pic_h;
+++            uint16_t pic_w;
+++            uint32_t stride2;
+++            uint32_t stride1;
+++            uint32_t wdenom;
+++            uint32_t dummy0;
+++        } s;
+++    };
+++    uint32_t next_fn;
+++} qpu_mc_pred_y_t;
+ +
+-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
+++#pragma pack(pop)
+ +
+-+#ifdef __cplusplus
+-+}
+ +#endif
+ +
+-+#endif /* __USER_VCSM__H__INCLUDED__ */
+ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+ new file mode 100644
+-index 0000000..9580165
++index 0000000..b061fe0
+ --- /dev/null
+ +++ b/libavcodec/rpi_zc.c
+-@@ -0,0 +1,406 @@
++@@ -0,0 +1,581 @@
+ +#include "config.h"
+ +#ifdef RPI
+ +#include "rpi_qpu.h"
+++#include "rpi_mailbox.h"
+ +#include "rpi_zc.h"
+++#include "libavutil/avassert.h"
+++#include <pthread.h>
+ +
+ +#include "libavutil/buffer_internal.h"
+++#include <interface/vctypes/vc_image_types.h>
+++
+++#define TRACE_ALLOC 0
+ +
+ +struct ZcPoolEnt;
+ +
+ +typedef struct ZcPool
+ +{
+ +    int numbytes;
+++    unsigned int n;
+ +    struct ZcPoolEnt * head;
+ +    pthread_mutex_t lock;
+ +} ZcPool;
+@@ -14981,27 +17836,56 @@ index 0000000..9580165
+ +{
+ +    // It is important that we start with gmem as other bits of code will expect to see that
+ +    GPU_MEM_PTR_T gmem;
+++    unsigned int n;
+ +    struct ZcPoolEnt * next;
+ +    struct ZcPool * pool;
+ +} ZcPoolEnt;
+ +
+-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
+++#if 1
+++//#define ALLOC_PAD       0x1000
+++#define ALLOC_PAD       0
+++#define ALLOC_ROUND     0x1000
+++//#define ALLOC_N_OFFSET  0x100
+++#define ALLOC_N_OFFSET  0
+++#define STRIDE_ROUND    0x80
+++#define STRIDE_OR       0x80
+++#else
+++#define ALLOC_PAD       0
+++#define ALLOC_ROUND     0x1000
+++#define ALLOC_N_OFFSET  0
+++#define STRIDE_ROUND    32
+++#define STRIDE_OR       0
+++#endif
+++
+++#define DEBUG_ZAP0_BUFFERS 0
+++
+++
+++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
+ +{
+ +    ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+ +
+++    // Add ALLOC_PAD to the request, then round up to a multiple of ALLOC_ROUND (4k)
+++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
+++
+ +    if (zp == NULL) {
+ +        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ +        goto fail0;
+ +    }
+ +
+-+    if (gpu_malloc_cached(size, &zp->gmem) != 0)
+++    if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
+ +    {
+-+        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
+++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
+ +        goto fail1;
+ +    }
+ +
+++#if TRACE_ALLOC
+++    printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+++#endif
+++
+++    pool->numbytes = zp->gmem.numbytes;
+ +    zp->next = NULL;
+ +    zp->pool = pool;
+++    zp->n = pool->n++;
+ +    return zp;
+ +
+ +fail1:
+@@ -15012,6 +17896,10 @@ index 0000000..9580165
+ +
+ +static void zc_pool_ent_free(ZcPoolEnt * const zp)
+ +{
+++#if TRACE_ALLOC
+++    printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
+++#endif
+++
+ +    gpu_free(&zp->gmem);
+ +    av_free(zp);
+ +}
+@@ -15020,6 +17908,8 @@ index 0000000..9580165
+ +{
+ +    ZcPoolEnt * p = pool->head;
+ +    pool->head = NULL;
+++    pool->numbytes = -1;
+++
+ +    while (p != NULL)
+ +    {
+ +        ZcPoolEnt * const zp = p;
+@@ -15028,15 +17918,21 @@ index 0000000..9580165
+ +    }
+ +}
+ +
+-+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes)
+++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
+ +{
+ +    ZcPoolEnt * zp;
+++    int numbytes;
+++
+ +    pthread_mutex_lock(&pool->lock);
+ +
+-+    if (numbytes != pool->numbytes)
+++    numbytes = pool->numbytes;
+++
+++    // If size isn't close then dump the pool
+++    // Close in this context means within 128k
+++    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
+ +    {
+ +        zc_pool_flush(pool);
+-+        pool->numbytes = numbytes;
+++        numbytes = req_bytes;
+ +    }
+ +
+ +    if (pool->head != NULL)
+@@ -15050,6 +17946,10 @@ index 0000000..9580165
+ +    }
+ +
+ +    pthread_mutex_unlock(&pool->lock);
+++
+++    // Start with our buffer empty of preconceptions
+++//    rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
+++
+ +    return zp;
+ +}
+ +
+@@ -15059,6 +17959,10 @@ index 0000000..9580165
+ +    if (zp != NULL)
+ +    {
+ +        pthread_mutex_lock(&pool->lock);
+++#if TRACE_ALLOC
+++        printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
+++#endif
+++
+ +        if (pool->numbytes == zp->gmem.numbytes)
+ +        {
+ +            zp->next = pool->head;
+@@ -15089,10 +17993,18 @@ index 0000000..9580165
+ +    pthread_mutex_destroy(&pool->lock);
+ +}
+ +
+++typedef struct ZcOldCtxVals
+++{
+++    int thread_safe_callbacks;
+++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
+++    void * get_buffer_context;
+++} ZcOldCtxVals;
+ +
+ +typedef struct AVZcEnv
+ +{
+++    unsigned int refcount;
+ +    ZcPool pool;
+++    ZcOldCtxVals old;
+ +} ZcEnv;
+ +
+ +// Callback when buffer unrefed to zero
+@@ -15112,28 +18024,94 @@ index 0000000..9580165
+ +}
+ +
+ +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+-+    const unsigned int video_width, const unsigned int video_height)
+++    const int format, const unsigned int video_width, const unsigned int video_height)
+ +{
+ +    AVRpiZcFrameGeometry geo;
+-+    geo.stride_y = (video_width + 32 + 31) & ~31;
+-+    geo.stride_c = geo.stride_y / 2;
+-+//    geo.height_y = (video_height + 15) & ~15;
+-+    geo.height_y = (video_height + 32 + 31) & ~31;
+-+    geo.height_c = geo.height_y / 2;
+++
+++    switch (format)
+++    {
+++        case AV_PIX_FMT_YUV420P:
+++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
+++        //    geo.stride_y = ((video_width + 32 + 31) & ~31);
+++            geo.stride_c = geo.stride_y / 2;
+++        //    geo.height_y = (video_height + 15) & ~15;
+++            geo.height_y = (video_height + 32 + 31) & ~31;
+++            geo.height_c = geo.height_y / 2;
+++            geo.planes_c = 2;
+++            geo.stripes = 1;
+++            break;
+++
+++        case AV_PIX_FMT_SAND128:
+++        {
+++            const unsigned int stripe_w = 128;
+++
+++            static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+++            static VC_IMAGE_T img = {0};
+++
+++            // Given the overhead of calling the mailbox keep a stashed
+++            // copy as we will almost certainly just want the same numbers again
+++            // but that means we need a lock
+++            pthread_mutex_lock(&sand_lock);
+++
+++            if (img.width != video_width || img.height != video_height)
+++            {
+++                VC_IMAGE_T new_img = {
+++                    .type = VC_IMAGE_YUV_UV,
+++                    .width = video_width,
+++                    .height = video_height
+++                };
+++
+++                gpu_ref();
+++                mbox_get_image_params(gpu_get_mailbox(), &new_img);
+++                gpu_unref();
+++                img = new_img;
+++            }
+++
+++            geo.stride_y = stripe_w;
+++            geo.stride_c = stripe_w;
+++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+++            geo.height_c = img.pitch / stripe_w - geo.height_y;
+++            geo.planes_c = 1;
+++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+++
+++            pthread_mutex_unlock(&sand_lock);
+++
+++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
+++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
+++            break;
+++        }
+++
+++        default:
+++            memset(&geo, 0, sizeof(geo));
+++            break;
+++    }
+ +    return geo;
+ +}
+ +
+++
+ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
+ +{
+ +    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ +    AVBufferRef * buf;
+++    intptr_t idata = (zp == NULL) ? 0 : (intptr_t)zp->gmem.arm;  // zp is only NULL-checked below: do not deref here
+++#if ALLOC_N_OFFSET != 0
+++    intptr_t noff = (zp == NULL) ? 0 : ((zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1));
+++#endif
+ +
+ +    if (zp == NULL) {
+ +        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ +        goto fail0;
+ +    }
+ +
+-+    if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+++#if ALLOC_N_OFFSET != 0
+++    idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
+++#endif
+++
+++#if DEBUG_ZAP0_BUFFERS
+++    memset((void*)idata, 0, size);
+++#endif
+++
+++    if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ +    {
+ +        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ +        goto fail2;
+@@ -15147,13 +18125,12 @@ index 0000000..9580165
+ +    return NULL;
+ +}
+ +
+-+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame)
+++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
+ +{
+-+    ZcEnv *const zc = s->get_buffer_context;
+-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height);
+++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
+ +    const unsigned int size_y = geo.stride_y * geo.height_y;
+ +    const unsigned int size_c = geo.stride_c * geo.height_c;
+-+    const unsigned int size_pic = size_y + size_c * 2;
+++    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
+ +    AVBufferRef * buf;
+ +    unsigned int i;
+ +
+@@ -15161,7 +18138,7 @@ index 0000000..9580165
+ +
+ +    if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
+ +    {
+-+        av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+++        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
+ +        return AVERROR(ENOMEM);
+ +    }
+ +
+@@ -15172,19 +18149,24 @@ index 0000000..9580165
+ +    }
+ +
+ +    frame->buf[0] = buf;
+++
+ +    frame->linesize[0] = geo.stride_y;
+ +    frame->linesize[1] = geo.stride_c;
+ +    frame->linesize[2] = geo.stride_c;
+++    if (geo.stripes > 1)
+++        frame->linesize[3] = geo.height_y + geo.height_c;      // abuse: linesize[3] = stripe stride
+++
+ +    frame->data[0] = buf->data;
+ +    frame->data[1] = frame->data[0] + size_y;
+-+    frame->data[2] = frame->data[1] + size_c;
+++    if (geo.planes_c > 1)
+++        frame->data[2] = frame->data[1] + size_c;
+++
+ +    frame->extended_data = frame->data;
+ +    // Leave extended buf alone
+ +
+ +    return 0;
+ +}
+ +
+-+
+ +#define RPI_GET_BUFFER2 1
+ +
+ +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
+@@ -15194,21 +18176,25 @@ index 0000000..9580165
+ +#else
+ +    int rv;
+ +
+-+    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 ||
+-+        frame->format != AV_PIX_FMT_YUV420P)
+++    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)
+ +    {
+ +//        printf("Do default alloc: format=%#x\n", frame->format);
+ +        rv = avcodec_default_get_buffer2(s, frame, flags);
+ +    }
+++    else if (frame->format == AV_PIX_FMT_YUV420P ||
+++             frame->format == AV_PIX_FMT_SAND128)
+++    {
+++        rv = rpi_get_display_buffer(s->get_buffer_context, frame);
+++    }
+ +    else
+ +    {
+-+        rv = rpi_get_display_buffer(s, frame);
+++        rv = avcodec_default_get_buffer2(s, frame, flags);
+ +    }
+ +
+ +#if 0
+-+    printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+-+        frame->width, frame->height,
+-+        frame->linesize[0], frame->linesize[1], frame->linesize[2],
+++    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
+++        frame->format, frame->width, frame->height,
+++        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
+ +        frame->data[0], frame->data[1], frame->data[2],
+ +        frame->buf[0], frame->buf[1], frame->buf[2],
+ +        av_buffer_get_opaque(frame->buf[0]));
+@@ -15229,7 +18215,7 @@ index 0000000..9580165
+ +    dest->width = src->width;
+ +    dest->height = src->height;
+ +
+-+    if (rpi_get_display_buffer(s, dest) != 0)
+++    if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
+ +    {
+ +        return NULL;
+ +    }
+@@ -15262,14 +18248,16 @@ index 0000000..9580165
+ +{
+ +    assert(s != NULL);
+ +
+-+    if (frame->format != AV_PIX_FMT_YUV420P)
+++    if (frame->format != AV_PIX_FMT_YUV420P &&
+++        frame->format != AV_PIX_FMT_SAND128)
+ +    {
+-+        av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format);
+++        av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ +        return NULL;
+ +    }
+ +
+ +    if (frame->buf[1] != NULL)
+ +    {
+++        av_assert0(frame->format == AV_PIX_FMT_YUV420P);
+ +        if (maycopy)
+ +        {
+ +            av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+@@ -15305,6 +18293,18 @@ index 0000000..9580165
+ +    return p == NULL ? -1 : p->vc_handle;
+ +}
+ +
+++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
+++{
+++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+++    return p == NULL ? 0 : fr_ref->data - p->arm;
+++}
+++
+++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
+++{
+++    return fr_ref == NULL ? 0 : fr_ref->size;
+++}
+++
+++
+ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+ +{
+ +    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
+@@ -15341,27 +18341,50 @@ index 0000000..9580165
+ +    }
+ +}
+ +
+++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
+++{
+++    return s->get_buffer2 == av_rpi_zc_get_buffer2;
+++}
+++
+ +int av_rpi_zc_init(struct AVCodecContext * const s)
+ +{
+-+    ZcEnv * const zc = av_rpi_zc_env_alloc();
+-+    if (zc == NULL)
+++    if (av_rpi_zc_in_use(s))
+ +    {
+-+        return AVERROR(ENOMEM);
+++        ZcEnv * const zc = s->get_buffer_context;
+++        ++zc->refcount;
+ +    }
+++    else
+++    {
+++        ZcEnv *const zc = av_rpi_zc_env_alloc();
+++        if (zc == NULL)
+++        {
+++            return AVERROR(ENOMEM);
+++        }
+++
+++        zc->refcount = 1;
+++        zc->old.get_buffer_context = s->get_buffer_context;
+++        zc->old.get_buffer2 = s->get_buffer2;
+++        zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
+ +
+-+    s->get_buffer_context = zc;
+-+    s->get_buffer2 = av_rpi_zc_get_buffer2;
+++        s->get_buffer_context = zc;
+++        s->get_buffer2 = av_rpi_zc_get_buffer2;
+++        s->thread_safe_callbacks = 1;
+++    }
+ +    return 0;
+ +}
+ +
+ +void av_rpi_zc_uninit(struct AVCodecContext * const s)
+ +{
+-+    if (s->get_buffer2 == av_rpi_zc_get_buffer2)
+++    if (av_rpi_zc_in_use(s))
+ +    {
+ +        ZcEnv * const zc = s->get_buffer_context;
+-+        s->get_buffer2 = avcodec_default_get_buffer2;
+-+        s->get_buffer_context = NULL;
+-+        av_rpi_zc_env_free(zc);
+++        if (--zc->refcount == 0)
+++        {
+++            s->get_buffer2 = zc->old.get_buffer2;
+++            s->get_buffer_context = zc->old.get_buffer_context;
+++            s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
+++            av_rpi_zc_env_free(zc);
+++        }
+ +    }
+ +}
+ +
+@@ -15369,19 +18392,19 @@ index 0000000..9580165
+ +
+ diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+ new file mode 100644
+-index 0000000..f0109f4
++index 0000000..f4aeb78
+ --- /dev/null
+ +++ b/libavcodec/rpi_zc.h
+-@@ -0,0 +1,83 @@
++@@ -0,0 +1,137 @@
+ +#ifndef LIBAVCODEC_RPI_ZC_H
+ +#define LIBAVCODEC_RPI_ZC_H
+ +
+ +// Zero-Copy frame code for RPi
+ +// RPi needs Y/U/V planes to be contiguous for display.  By default
+ +// ffmpeg will allocate separated planes so a memcpy is needed before
+-+// display.  This code prodes a method a making ffmpeg allocate a single
+-+// bit of memory for the frame when can then be refrence counted until
+-+// display ahs finsihed with it.
+++// display.  This code provides a method of making ffmpeg allocate a single
+++// bit of memory for the frame which can then be reference counted until
+++// display has finished with it.
+ +
+ +#include "libavutil/frame.h"
+ +#include "libavcodec/avcodec.h"
+@@ -15398,10 +18421,13 @@ index 0000000..f0109f4
+ +    unsigned int height_y;
+ +    unsigned int stride_c;
+ +    unsigned int height_c;
+++    unsigned int planes_c;
+++    unsigned int stripes;
+ +} AVRpiZcFrameGeometry;
+ +
+ +
+ +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
+++    const int format,
+ +    const unsigned int video_width, const unsigned int video_height);
+ +
+ +// Replacement fn for avctx->get_buffer2
+@@ -15410,7 +18436,7 @@ index 0000000..f0109f4
+ +// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames
+ +// must be set to 1 as otherwise the buffer info is killed before being returned
+ +// by avcodec_decode_video2.  Note also that this means that the AVFrame that is
+-+// return must be manually derefed with av_frame_unref.  This should be done
+++// returned must be manually derefed with av_frame_unref.  This should be done
+ +// after av_rpi_zc_ref has been called.
+ +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+ +
+@@ -15427,6 +18453,11 @@ index 0000000..f0109f4
+ +// Get the vc_handle from the frame ref
+ +// Returns -1 if ref doesn't look valid
+ +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+++// Get offset from the start of the memory referenced
+++// by the vc_handle to valid data
+++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+++// Length of buffer data
+++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+ +// Get the number of bytes allocated from the frame ref
+ +// Returns 0 if ref doesn't look valid
+ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+@@ -15443,6 +18474,8 @@ index 0000000..f0109f4
+ +// Allocate the environment used by the ZC code
+ +void av_rpi_zc_env_free(AVZcEnvPtr);
+ +
+++// Test to see if the context is using zc (checks get_buffer2)
+++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+ +
+ +// Init ZC into a context
+ +// There is nothing magic in this fn - it just packages setting
+@@ -15454,10 +18487,54 @@ index 0000000..f0109f4
+ +// get_buffer2 & get_buffer_context
+ +void av_rpi_zc_uninit(struct AVCodecContext * const s);
+ +
+++
+++
+++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame)
+++{
+++    return frame->linesize[3];
+++}
+++
+++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    const unsigned int stride1 = frame->linesize[0];
+++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
+++    const unsigned int x1 = x & (stride1 - 1);
+++    const unsigned int x2 = x ^ x1;
+++
+++    return x1 + stride1 * y + stride2 * x2;
+++}
+++
+++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+++{
+++    const unsigned int stride1 = frame->linesize[0];
+++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
+++    const unsigned int x = x_c * 2;
+++    const unsigned int x1 = x & (stride1 - 1);
+++    const unsigned int x2 = x ^ x1;
+++
+++    return x1 + stride1 * y_c + stride2 * x2;
+++}
+++
+++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y);
+++}
+++
+++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++    return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y);
+++}
+++
+++static inline int rpi_sliced_frame(const AVFrame * const frame)
+++{
+++    return frame->format == AV_PIX_FMT_SAND128;
+++}
+++
+++
+ +#endif
+ +
+ diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+-index f7adb52..708526e 100644
++index f7adb52..3b398a3 100644
+ --- a/libavcodec/utils.c
+ +++ b/libavcodec/utils.c
+ @@ -26,6 +26,12 @@
+@@ -15547,6 +18624,30 @@ index f7adb52..708526e 100644
+                  pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
+                                                       CONFIG_MEMORY_POISONING ?
+                                                          NULL :
++@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags
++ {
++     int ret;
++ 
+++#ifdef RPI
+++    // This is going to end badly if we let it continue
+++    av_assert0(frame->format != AV_PIX_FMT_SAND128);
+++#endif
+++
++     if ((ret = update_frame_pool(avctx, frame)) < 0)
++         return ret;
++ 
++diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
++index 21f8d9e..71ce7b9 100644
++--- a/libavfilter/avfilter.c
+++++ b/libavfilter/avfilter.c
++@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args)
++                    "options, but options were provided: %s.\n", args);
++             return AVERROR(EINVAL);
++         }
+++        av_log(filter, AV_LOG_DEBUG, "=== args='%s'\n", args); /* debug trace via av_log, not stdout */
++ 
++ #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
++             if (   !strcmp(filter->filter->name, "format")     ||
+ diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
+ index b31d233..2767306 100644
+ --- a/libavformat/mpegts.c
+@@ -15601,6 +18702,88 @@ index 0c0ce12..82e0bc3 100644
+  /**
+   * @}
+   */
++diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
++index 0dffa4d..5644176 100644
++--- a/libavutil/pixdesc.c
+++++ b/libavutil/pixdesc.c
++@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
++         .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR |
++                  AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA,
++     },
+++    [AV_PIX_FMT_SAND128] = {
+++        .name = "sand128",
+++        .nb_components = 3,
+++        .log2_chroma_w = 1,
+++        .log2_chroma_h = 1,
+++        .comp = {
+++            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
+++            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
+++            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
+++        },
+++        .flags = 0,
+++    }
++ };
++ #if FF_API_PLUS1_MINUS1
++ FF_ENABLE_DEPRECATION_WARNINGS
++diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
++index 0ed01c4..4705e80 100644
++--- a/libavutil/pixfmt.h
+++++ b/libavutil/pixfmt.h
++@@ -303,7 +303,10 @@ enum AVPixelFormat {
++     AV_PIX_FMT_GBRAP10BE,  ///< planar GBR 4:4:4:4 40bpp, big-endian
++     AV_PIX_FMT_GBRAP10LE,  ///< planar GBR 4:4:4:4 40bpp, little-endian
++ 
++-    AV_PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
+++// RPI - deliberately not inside an #ifdef so that calling programs can use it
+++    AV_PIX_FMT_SAND128,   ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
+++
+++    AV_PIX_FMT_NB         ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
++ };
++ 
++ #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A
++diff --git a/libswscale/input.c b/libswscale/input.c
++index 14ab5ab..e61b67a 100644
++--- a/libswscale/input.c
+++++ b/libswscale/input.c
++@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
++     }
++ }
++ 
+++
+++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+++                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+++                       int width, uint32_t *unused)
+++{
+++    // NIF (presumably "not implemented"): dstU/dstV are left unwritten
+++}
+++
++ #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
++ 
++ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
++@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
++     case AV_PIX_FMT_P010BE:
++         c->chrToYV12 = p010BEToUV_c;
++         break;
+++    case AV_PIX_FMT_SAND128:
+++        c->chrToYV12 = sand128ToUV_c;
+++        break;
++     }
++     if (c->chrSrcHSubSample) {
++         switch (srcFormat) {
++diff --git a/libswscale/utils.c b/libswscale/utils.c
++index 576d8f0..d7206cc 100644
++--- a/libswscale/utils.c
+++++ b/libswscale/utils.c
++@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
++     [AV_PIX_FMT_AYUV64LE]    = { 1, 1},
++     [AV_PIX_FMT_P010LE]      = { 1, 0 },
++     [AV_PIX_FMT_P010BE]      = { 1, 0 },
+++#ifdef RPI
+++    [AV_PIX_FMT_SAND128]     = { 1, 0 },
+++#endif
++ };
++ 
++ int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+ diff --git a/pi-util/conf.sh b/pi-util/conf.sh
+ new file mode 100755
+ index 0000000..8b596a2
+@@ -15640,21 +18823,61 @@ index 0000000..8b596a2
+ +
+ +# gcc option for getting asm listing
+ +# -Wa,-ahls
++diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh
++new file mode 100644
++index 0000000..160e149
++--- /dev/null
+++++ b/pi-util/conf1.sh
++@@ -0,0 +1,34 @@
+++echo "Configure for Pi1"
+++
+++RPI_BUILDROOT=`pwd`/build
+++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+++RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+++#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+++#RPI_KEEPS="-save-temps=obj"
+++RPI_KEEPS=""
+++
+++./configure --enable-cross-compile\
+++ --cpu=arm1176jzf-s\
+++ --arch=armv\
+++ --disable-neon\
+++ --target-os=linux\
+++ --disable-stripping\
+++ --enable-mmal\
+++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+++
+++
+++# --enable-extra-warnings\
+++# --arch=armv71\
+++# --enable-shared\
+++
+++# gcc option for getting asm listing
+++# -Wa,-ahls
+ diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+ new file mode 100644
+-index 0000000..61d1399
++index 0000000..fc14f2a
+ --- /dev/null
+ +++ b/pi-util/conf_h265.csv
+ @@ -0,0 +1,144 @@
+ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+-+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
+ +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
+ +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
+ +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
+ +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
+ +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
+ +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
+-+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
+ +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
+ +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
+ +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
+@@ -15676,7 +18899,7 @@ index 0000000..61d1399
+ +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
+ +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
+ +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
+-+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
+ +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
+ +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
+ +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
+@@ -15716,7 +18939,7 @@ index 0000000..61d1399
+ +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
+ +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
+ +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
+-+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
+ +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
+ +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
+ +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
+@@ -15730,10 +18953,10 @@ index 0000000..61d1399
+ +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
+ +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
+ +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
+-+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
+ +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
+ +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
+-+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
+ +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
+ +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
+ +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
+@@ -15762,7 +18985,7 @@ index 0000000..61d1399
+ +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
+ +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
+ +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
+-+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
+ +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
+ +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
+ +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
+@@ -15771,9 +18994,9 @@ index 0000000..61d1399
+ +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+ +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+ +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
+-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
+++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+ +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+-+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+ +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
+ +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
+ +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
+@@ -15792,10 +19015,10 @@ index 0000000..61d1399
+ +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+ diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+ new file mode 100644
+-index 0000000..38f942f
++index 0000000..c896bc6
+ --- /dev/null
+ +++ b/pi-util/ffconf.py
+-@@ -0,0 +1,146 @@
++@@ -0,0 +1,154 @@
+ +#!/usr/bin/env python
+ +
+ +import os
+@@ -15839,16 +19062,18 @@ index 0000000..38f942f
+ +    except:
+ +        pass
+ +
+-+    rv = False
+ +    if  m1 and m2 and m1.group() == m2.group():
+ +        print >> flog, "Match: " + m1.group()
+-+        rv = True
+++        rv = 0
+ +    elif not m1:
+ +        print >> flog, "****** Cannot find m1"
+++        rv = 3
+ +    elif not m2:
+ +        print >> flog, "****** Cannot find m2"
+++        rv = 2
+ +    else:
+ +        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
+++        rv = 1
+ +    flog.close()
+ +    return rv
+ +
+@@ -15894,19 +19119,25 @@ index 0000000..38f942f
+ +            print "==== ", name,
+ +            sys.stdout.flush()
+ +
+-+            if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
+-+                if exp_test == 1:
+-+                    failures.append(name)
+-+                    print ": * FAIL *"
+-+                else:
+-+                    print ": fail"
+-+            else:
+++            rv = testone(os.path.join(conf_root, name), name, a[2], a[3])
+++            if (rv == 0):
+ +                if exp_test == 2:
+ +                    print ": * OK *"
+ +                    unx_success.append(name)
+ +                else:
+ +                    print ": ok"
+-+
+++            elif exp_test > 1 and rv == 1:
+++                print ": fail"
+++            else:
+++                failures.append(name)
+++                if rv == 1:
+++                    print ": * FAIL *"
+++                elif (rv == 2) :
+++                    print ": * CRASH *"
+++                elif (rv == 3) :
+++                    print ": * MD5 MISSING *"
+++                else :
+++                    print ": * BANG *"
+ +
+ +    if failures or unx_success:
+ +        print "Unexpected Failures:", failures
+@@ -18450,6 +21681,21 @@ index 0000000..1eacc04
+ +
+ +if __name__ == '__main__':
+ +   main()
++diff --git a/pi-util/qem.sh b/pi-util/qem.sh
++new file mode 100644
++index 0000000..47dd071
++--- /dev/null
+++++ b/pi-util/qem.sh
++@@ -0,0 +1,9 @@
+++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+++QASM=python\ pi-util/qasm.py
+++SRC_FILE=libavcodec/rpi_shader.qasm
+++DST_BASE=shader
+++
+++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+++
+ diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+ new file mode 100755
+ index 0000000..6a9a33f
+@@ -18542,3 +21788,137 @@ index 0000000..d8bdd91
+ +pi-util/rebase_liblinks.py $DST
+ +
+ +
++diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
++new file mode 100644
++index 0000000..5935a11
++--- /dev/null
+++++ b/pi-util/v3dusage.py
++@@ -0,0 +1,128 @@
+++#!/usr/bin/env python
+++
+++import sys
+++import argparse
+++import re
+++
+++def do_logparse(logname):
+++    """Print per-unit busy/idle times and QPU cycle/L2 counters parsed from the log file `logname`."""
+++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
+++    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
+++    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
+++    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
+++
+++    ttotal = {'idle':0.0}
+++    tstart = {}
+++    qctotal = {}
+++    qtstotal = {}
+++    l2hits = {}
+++    l2total = {}
+++    time0 = None
+++    idle_start = None
+++    qpu_op_no = 0
+++    op_count = 0
+++
+++    with open(logname, "rt") as infile:
+++        for line in infile:
+++            match = rmatch.match(line)
+++            if match:
+++#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
+++                time = float(match.group(1))
+++                unit = match.group(3)
+++                opstart = not match.group(2)
+++                optype = match.group(7)
+++                hascb = match.group(8) != "0"
+++
+++                if unit == 'qpu1':
+++                    unit = unit + "." + str(qpu_op_no)
+++                    if not opstart:
+++                        if hascb or optype == 'EXECUTE_SYNC':
+++                            qpu_op_no = 0
+++                        else:
+++                            qpu_op_no += 1
+++
+++                # Ignore sync type
+++                if optype == 'EXECUTE_SYNC':
+++                    continue
+++
+++                if time0 is None:  # a timestamp of 0.000 must still count as "set"
+++                    time0 = time
+++
+++                if opstart:
+++                    tstart[unit] = time
+++                elif unit in tstart:
+++                    op_count += 1
+++                    if not unit in ttotal:
+++                        ttotal[unit] = 0.0
+++                    ttotal[unit] += time - tstart[unit]
+++                    del tstart[unit]
+++
+++                if idle_start is None and not tstart:  # nothing in flight: idle period begins
+++                    idle_start = time
+++                elif idle_start is not None and tstart:  # something started: idle period ends
+++                    ttotal['idle'] += time - idle_start
+++                    idle_start = None
+++
+++            match = rqcycle.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in qctotal:
+++                    qctotal[unit] = 0
+++                qctotal[unit] += int(match.group(2))
+++
+++            match = rqtscycle.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in qtstotal:
+++                    qtstotal[unit] = 0
+++                qtstotal[unit] += int(match.group(2))
+++
+++            match = rl2hits.match(line)
+++            if match:
+++                unit = "qpu1." + str(qpu_op_no)
+++                if not unit in l2total:
+++                    l2total[unit] = 0
+++                    l2hits[unit] = 0
+++                l2total[unit] += int(match.group(3))
+++                if match.group(2) == "hits":
+++                    l2hits[unit] += int(match.group(3))
+++
+++
+++    if time0 is None:
+++        print "No v3d profile records found"
+++    else:
+++        tlogged = time - time0
+++        tdiv = tlogged if tlogged else 1.0  # avoid dividing by zero when every record shares one timestamp
+++        print "Logged time:", tlogged, "  Op count:", op_count
+++        for unit in sorted(ttotal):
+++            print b'%6s: %10.3f    %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tdiv)
+++        print
+++        for unit in sorted(qctotal):
+++            if not unit in qtstotal:
+++                qtstotal[unit] = 0
+++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
+++            if unit in l2total:
+++                print b'        L2Total: %10d, hits:      %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
+++
+++
+++
+++if __name__ == '__main__':
+++    argp = argparse.ArgumentParser(
+++        formatter_class=argparse.RawDescriptionHelpFormatter,
+++        description="QPU/VPU perf summary from VC logging",
+++        epilog = """
+++Will also summarise TMU stalls if logging requests set in qpu noflush param
+++in the profiled code.
+++
+++Example use:
+++  vcgencmd set_logging level=0xc0
+++  <command to profile>
+++  sudo vcdbg log msg >& t.log
+++  v3dusage.py t.log
+++""")
+++
+++    argp.add_argument("logfile")
+++    args = argp.parse_args()
+++
+++    do_logparse(args.logfile)
+++